]> jfr.im git - yt-dlp.git/blame_incremental - youtube_dlc/extractor/youtube.py
Merge branch 'master' into youtube-playlist-polymer
[yt-dlp.git] / youtube_dlc / extractor / youtube.py
... / ...
CommitLineData
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28)
29from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 js_to_json,
40 mimetype2ext,
41 orderedSet,
42 parse_codecs,
43 parse_count,
44 parse_duration,
45 remove_quotes,
46 remove_start,
47 smuggle_url,
48 str_or_none,
49 str_to_int,
50 try_get,
51 unescapeHTML,
52 unified_strdate,
53 unsmuggle_url,
54 uppercase_escape,
55 url_or_none,
56 urlencode_postdata,
57)
58
59
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by _login().
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # '{0}' is filled with the 'TL' token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches the IDs of the various playlist flavours (PL..., OLAK5uy_..., etc.).
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
    # Captures the JSON object assigned to ytInitialData in a page.
    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # Captures the JSON argument of each ytcfg.set(...) call.
    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        """Force English UI/results by setting the PREF cookie on .youtube.com."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST the hidden form fields plus the sign-in payload f_req and
            # return the parsed JSON response (False on download failure).
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Strip everything before the first '[' so the body parses as JSON.
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Payload layout reverse-engineered from the Google web sign-in flow;
        # positions are significant and must not be reordered.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users often paste codes with the 'G-' prefix from SMS; drop it.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Other account challenges cannot be solved automatically;
                # point the user at a browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through a page referencing myaccount.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Copy the query dict so the caller's dict is never mutated downstream.
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON object from *webpage*.

        Returns the parsed dict, or None if it cannot be found or parsed.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        # Without a downloader there is nothing to configure.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
299
300
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Shared JSON-walking and pagination helpers for YouTube listing pages."""

    def _find_entries_in_json(self, extracted):
        """Recursively walk the parsed ytInitialData tree.

        Collects every dict for which self._is_entry() (defined by
        subclasses) is true, and separately the first dict carrying a
        'continuationCommand' key (used to request the next page).

        Returns a (entries, continuation_or_None) tuple.
        """
        entries = []
        c = {}

        def _real_find(obj):
            # Strings are leaves; None carries nothing.
            if obj is None or isinstance(obj, str):
                return

            if type(obj) is list:
                for elem in obj:
                    _real_find(elem)

            if type(obj) is dict:
                # An entry dict is collected whole; do not descend into it.
                if self._is_entry(obj):
                    entries.append(obj)
                    return

                if 'continuationCommand' in obj:
                    c['continuation'] = obj
                    return

                for _, o in obj.items():
                    _real_find(o)

        _real_find(extracted)

        return entries, try_get(c, lambda x: x["continuation"])

    def _entries(self, page, playlist_id, n=1):
        """Yield processed entries from *page*, following continuations
        via the innertube browse API for up to *n* pages.

        page -- HTML of the first listing page
        playlist_id -- id used for logging/error reporting
        n -- maximum number of pages to process
        """
        seen = []

        # Collect ytcfg.set(...) values (API key, innertube context) needed
        # to issue continuation requests.
        yt_conf = {}
        for m in re.finditer(self._YTCFG_DATA_RE, page):
            parsed = self._parse_json(m.group(1), playlist_id,
                                      transform_source=js_to_json, fatal=False)
            if parsed:
                yt_conf.update(parsed)

        data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)

        # for page_num in itertools.count(1):
        for page_num in range(n):
            entries, continuation = self._find_entries_in_json(data_json)
            processed = self._process_entries(entries, seen)

            if not processed:
                break
            for entry in processed:
                yield entry

            # Without continuation data or ytcfg values we cannot page further.
            if not continuation or not yt_conf:
                break
            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
            if not continuation_token or not continuation_url:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    data_json = self._download_json(
                        'https://www.youtube.com%s' % continuation_url,
                        playlist_id,
                        'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),

                        transform_source=uppercase_escape,
                        query={
                            'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
                        },
                        data=bytes(json.dumps({
                            'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
                            'continuation': continuation_token
                        }), encoding='utf-8'),
                        headers={
                            'Content-Type': 'application/json'
                        }
                    )
                    break
                except ExtractorError as e:
                    # Only retry on 500/503; anything else (or exhausted
                    # retries) is re-raised.
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

    def _extract_title(self, renderer):
        """Return the renderer's title from either the 'runs' or the
        'simpleText' representation, or None if neither is present."""
        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        if title:
            return title
        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
396
397
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Video-list pages: an entry is any renderer dict carrying a videoId."""

    def _is_entry(self, obj):
        # A dict counts as a video entry as soon as it has a 'videoId' key.
        return 'videoId' in obj

    def _process_entries(self, entries, seen):
        """Yield a url_result per unique video, in first-seen order,
        back-filling an empty title if a later duplicate supplies one."""
        found_ids = []
        found_titles = []
        for item in entries:
            vid = try_get(item, lambda x: x['videoId'])
            title = self._extract_title(item)

            # Not a proper videoRenderer, or title extraction broke: skip it.
            if vid is None or title is None:
                continue

            title = title.strip()

            if vid in found_ids:
                pos = found_ids.index(vid)
                # Keep the first ID but prefer a non-empty title.
                if title and not found_titles[pos]:
                    found_titles[pos] = title
            else:
                found_ids.append(vid)
                found_titles.append(title)

        for vid, title in zip(found_ids, found_titles):
            yield self.url_result(vid, 'Youtube', vid, title)
425
426
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Playlists-of-a-channel pages: an entry carries a playlistId."""

    def _is_entry(self, obj):
        # A dict counts as a playlist entry as soon as it has a 'playlistId' key.
        return 'playlistId' in obj

    def _process_entries(self, entries, seen):
        """Yield one playlist url_result per unique playlistId, first-seen order."""
        unique_ids = orderedSet(
            try_get(renderer, lambda x: x['playlistId']) for renderer in entries)
        for pl_id in unique_ids:
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % pl_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download the listing page and wrap its entries in a playlist_result."""
        pl_id = self._match_id(url)
        page = self._download_webpage(url, pl_id)
        page_title = self._og_search_title(page, fatal=False)
        return self.playlist_result(
            self._entries(page, pl_id), pl_id, page_title)
442
443
444class YoutubeIE(YoutubeBaseInfoExtractor):
445 IE_DESC = 'YouTube.com'
446 _VALID_URL = r"""(?x)^
447 (
448 (?:https?://|//) # http(s):// or protocol-independent URL
449 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
450 (?:www\.)?deturl\.com/www\.youtube\.com/|
451 (?:www\.)?pwnyoutube\.com/|
452 (?:www\.)?hooktube\.com/|
453 (?:www\.)?yourepeat\.com/|
454 tube\.majestyc\.net/|
455 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
456 (?:(?:www|dev)\.)?invidio\.us/|
457 (?:(?:www|no)\.)?invidiou\.sh/|
458 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
459 (?:www\.)?invidious\.kabi\.tk/|
460 (?:www\.)?invidious\.13ad\.de/|
461 (?:www\.)?invidious\.mastodon\.host/|
462 (?:www\.)?invidious\.nixnet\.xyz/|
463 (?:www\.)?invidious\.drycat\.fr/|
464 (?:www\.)?tube\.poal\.co/|
465 (?:www\.)?vid\.wxzm\.sx/|
466 (?:www\.)?yewtu\.be/|
467 (?:www\.)?yt\.elukerio\.org/|
468 (?:www\.)?yt\.lelux\.fi/|
469 (?:www\.)?invidious\.ggc-project\.de/|
470 (?:www\.)?yt\.maisputain\.ovh/|
471 (?:www\.)?invidious\.13ad\.de/|
472 (?:www\.)?invidious\.toot\.koeln/|
473 (?:www\.)?invidious\.fdn\.fr/|
474 (?:www\.)?watch\.nettohikari\.com/|
475 (?:www\.)?kgg2m7yk5aybusll\.onion/|
476 (?:www\.)?qklhadlycap4cnod\.onion/|
477 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
478 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
479 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
480 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
481 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
482 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
483 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
484 (?:.*?\#/)? # handle anchor (#/) redirect urls
485 (?: # the various things that can precede the ID:
486 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
487 |(?: # or the v= param in all its forms
488 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
489 (?:\?|\#!?) # the params delimiter ? or # or #!
490 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
491 v=
492 )
493 ))
494 |(?:
495 youtu\.be| # just youtu.be/xxxx
496 vid\.plus| # or vid.plus/xxxx
497 zwearz\.com/watch| # or zwearz.com/watch/xxxx
498 )/
499 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
500 )
501 )? # all until now is optional -> you can pass the naked ID
502 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
503 (?!.*?\blist=
504 (?:
505 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
506 WL # WL are handled by the watch later IE
507 )
508 )
509 (?(1).+)? # if we found the ID, everything can follow
510 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
511 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
512 _PLAYER_INFO_RE = (
513 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
514 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
515 )
516 _formats = {
517 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
518 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
519 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
520 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
521 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
522 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
523 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
524 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
525 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
526 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
527 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
528 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
529 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
530 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
531 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
532 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
533 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
534 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
535
536
537 # 3D videos
538 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
539 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
540 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
541 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
542 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
543 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
544 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
545
546 # Apple HTTP Live Streaming
547 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
548 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
549 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
550 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
551 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
552 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
553 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
554 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
555
556 # DASH mp4 video
557 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
558 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
559 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
560 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
561 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
562 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
563 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
564 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
565 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
566 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
567 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
568 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
569
570 # Dash mp4 audio
571 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
572 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
573 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
574 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
575 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
576 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
577 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
578
579 # Dash webm
580 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
581 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
582 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
583 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
584 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
585 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
586 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
587 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
588 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
589 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
590 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
591 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
592 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
593 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
594 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
595 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
596 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
597 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
598 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
599 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
600 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
601 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
602
603 # Dash webm audio
604 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
605 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
606
607 # Dash webm audio with opus inside
608 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
609 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
610 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
611
612 # RTMP (unnamed)
613 '_rtmp': {'protocol': 'rtmp'},
614
615 # av01 video only formats sometimes served with "unknown" codecs
616 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
617 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
618 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
619 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
620 }
621 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
622
623 _GEO_BYPASS = False
624
625 IE_NAME = 'youtube'
626 _TESTS = [
627 {
628 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
629 'info_dict': {
630 'id': 'BaW_jenozKc',
631 'ext': 'mp4',
632 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
633 'uploader': 'Philipp Hagemeister',
634 'uploader_id': 'phihag',
635 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
636 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
637 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
638 'upload_date': '20121002',
639 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
640 'categories': ['Science & Technology'],
641 'tags': ['youtube-dl'],
642 'duration': 10,
643 'view_count': int,
644 'like_count': int,
645 'dislike_count': int,
646 'start_time': 1,
647 'end_time': 9,
648 }
649 },
650 {
651 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
652 'note': 'Embed-only video (#1746)',
653 'info_dict': {
654 'id': 'yZIXLfi8CZQ',
655 'ext': 'mp4',
656 'upload_date': '20120608',
657 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
658 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
659 'uploader': 'SET India',
660 'uploader_id': 'setindia',
661 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
662 'age_limit': 18,
663 }
664 },
665 {
666 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
667 'note': 'Use the first video ID in the URL',
668 'info_dict': {
669 'id': 'BaW_jenozKc',
670 'ext': 'mp4',
671 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
672 'uploader': 'Philipp Hagemeister',
673 'uploader_id': 'phihag',
674 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
675 'upload_date': '20121002',
676 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
677 'categories': ['Science & Technology'],
678 'tags': ['youtube-dl'],
679 'duration': 10,
680 'view_count': int,
681 'like_count': int,
682 'dislike_count': int,
683 },
684 'params': {
685 'skip_download': True,
686 },
687 },
688 {
689 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
690 'note': '256k DASH audio (format 141) via DASH manifest',
691 'info_dict': {
692 'id': 'a9LDPn-MO4I',
693 'ext': 'm4a',
694 'upload_date': '20121002',
695 'uploader_id': '8KVIDEO',
696 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
697 'description': '',
698 'uploader': '8KVIDEO',
699 'title': 'UHDTV TEST 8K VIDEO.mp4'
700 },
701 'params': {
702 'youtube_include_dash_manifest': True,
703 'format': '141',
704 },
705 'skip': 'format 141 not served anymore',
706 },
707 # Controversy video
708 {
709 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
710 'info_dict': {
711 'id': 'T4XJQO3qol8',
712 'ext': 'mp4',
713 'duration': 219,
714 'upload_date': '20100909',
715 'uploader': 'Amazing Atheist',
716 'uploader_id': 'TheAmazingAtheist',
717 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
718 'title': 'Burning Everyone\'s Koran',
719 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
720 }
721 },
722 # Normal age-gate video (embed allowed)
723 {
724 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
725 'info_dict': {
726 'id': 'HtVdAasjOgU',
727 'ext': 'mp4',
728 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
729 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
730 'duration': 142,
731 'uploader': 'The Witcher',
732 'uploader_id': 'WitcherGame',
733 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
734 'upload_date': '20140605',
735 'age_limit': 18,
736 },
737 },
738 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
739 {
740 'url': 'lqQg6PlCWgI',
741 'info_dict': {
742 'id': 'lqQg6PlCWgI',
743 'ext': 'mp4',
744 'duration': 6085,
745 'upload_date': '20150827',
746 'uploader_id': 'olympic',
747 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
748 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
749 'uploader': 'Olympic',
750 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
751 },
752 'params': {
753 'skip_download': 'requires avconv',
754 }
755 },
756 # Non-square pixels
757 {
758 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
759 'info_dict': {
760 'id': '_b-2C3KPAM0',
761 'ext': 'mp4',
762 'stretched_ratio': 16 / 9.,
763 'duration': 85,
764 'upload_date': '20110310',
765 'uploader_id': 'AllenMeow',
766 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
767 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
768 'uploader': '孫ᄋᄅ',
769 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
770 },
771 },
772 # url_encoded_fmt_stream_map is empty string
773 {
774 'url': 'qEJwOuvDf7I',
775 'info_dict': {
776 'id': 'qEJwOuvDf7I',
777 'ext': 'webm',
778 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
779 'description': '',
780 'upload_date': '20150404',
781 'uploader_id': 'spbelect',
782 'uploader': 'Наблюдатели Петербурга',
783 },
784 'params': {
785 'skip_download': 'requires avconv',
786 },
787 'skip': 'This live event has ended.',
788 },
789 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
790 {
791 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
792 'info_dict': {
793 'id': 'FIl7x6_3R5Y',
794 'ext': 'webm',
795 'title': 'md5:7b81415841e02ecd4313668cde88737a',
796 'description': 'md5:116377fd2963b81ec4ce64b542173306',
797 'duration': 220,
798 'upload_date': '20150625',
799 'uploader_id': 'dorappi2000',
800 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
801 'uploader': 'dorappi2000',
802 'formats': 'mincount:31',
803 },
804 'skip': 'not actual anymore',
805 },
806 # DASH manifest with segment_list
807 {
808 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
809 'md5': '8ce563a1d667b599d21064e982ab9e31',
810 'info_dict': {
811 'id': 'CsmdDsKjzN8',
812 'ext': 'mp4',
813 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
814 'uploader': 'Airtek',
815 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
816 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
817 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
818 },
819 'params': {
820 'youtube_include_dash_manifest': True,
821 'format': '135', # bestvideo
822 },
823 'skip': 'This live event has ended.',
824 },
825 {
826 # Multifeed videos (multiple cameras), URL is for Main Camera
827 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
828 'info_dict': {
829 'id': 'jqWvoWXjCVs',
830 'title': 'teamPGP: Rocket League Noob Stream',
831 'description': 'md5:dc7872fb300e143831327f1bae3af010',
832 },
833 'playlist': [{
834 'info_dict': {
835 'id': 'jqWvoWXjCVs',
836 'ext': 'mp4',
837 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
838 'description': 'md5:dc7872fb300e143831327f1bae3af010',
839 'duration': 7335,
840 'upload_date': '20150721',
841 'uploader': 'Beer Games Beer',
842 'uploader_id': 'beergamesbeer',
843 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
844 'license': 'Standard YouTube License',
845 },
846 }, {
847 'info_dict': {
848 'id': '6h8e8xoXJzg',
849 'ext': 'mp4',
850 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
851 'description': 'md5:dc7872fb300e143831327f1bae3af010',
852 'duration': 7337,
853 'upload_date': '20150721',
854 'uploader': 'Beer Games Beer',
855 'uploader_id': 'beergamesbeer',
856 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
857 'license': 'Standard YouTube License',
858 },
859 }, {
860 'info_dict': {
861 'id': 'PUOgX5z9xZw',
862 'ext': 'mp4',
863 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
864 'description': 'md5:dc7872fb300e143831327f1bae3af010',
865 'duration': 7337,
866 'upload_date': '20150721',
867 'uploader': 'Beer Games Beer',
868 'uploader_id': 'beergamesbeer',
869 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
870 'license': 'Standard YouTube License',
871 },
872 }, {
873 'info_dict': {
874 'id': 'teuwxikvS5k',
875 'ext': 'mp4',
876 'title': 'teamPGP: Rocket League Noob Stream (zim)',
877 'description': 'md5:dc7872fb300e143831327f1bae3af010',
878 'duration': 7334,
879 'upload_date': '20150721',
880 'uploader': 'Beer Games Beer',
881 'uploader_id': 'beergamesbeer',
882 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
883 'license': 'Standard YouTube License',
884 },
885 }],
886 'params': {
887 'skip_download': True,
888 },
889 'skip': 'This video is not available.',
890 },
891 {
892 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
893 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
894 'info_dict': {
895 'id': 'gVfLd0zydlo',
896 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
897 },
898 'playlist_count': 2,
899 'skip': 'Not multifeed anymore',
900 },
901 {
902 'url': 'https://vid.plus/FlRa-iH7PGw',
903 'only_matching': True,
904 },
905 {
906 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
907 'only_matching': True,
908 },
909 {
910 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
911 # Also tests cut-off URL expansion in video description (see
912 # https://github.com/ytdl-org/youtube-dl/issues/1892,
913 # https://github.com/ytdl-org/youtube-dl/issues/8164)
914 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
915 'info_dict': {
916 'id': 'lsguqyKfVQg',
917 'ext': 'mp4',
918 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
919 'alt_title': 'Dark Walk - Position Music',
920 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
921 'duration': 133,
922 'upload_date': '20151119',
923 'uploader_id': 'IronSoulElf',
924 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
925 'uploader': 'IronSoulElf',
926 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
927 'track': 'Dark Walk - Position Music',
928 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
929 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
930 },
931 'params': {
932 'skip_download': True,
933 },
934 },
935 {
936 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
937 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
938 'only_matching': True,
939 },
940 {
941 # Video with yt:stretch=17:0
942 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
943 'info_dict': {
944 'id': 'Q39EVAstoRM',
945 'ext': 'mp4',
946 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
947 'description': 'md5:ee18a25c350637c8faff806845bddee9',
948 'upload_date': '20151107',
949 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
950 'uploader': 'CH GAMER DROID',
951 },
952 'params': {
953 'skip_download': True,
954 },
955 'skip': 'This video does not exist.',
956 },
957 {
958 # Video licensed under Creative Commons
959 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
960 'info_dict': {
961 'id': 'M4gD1WSo5mA',
962 'ext': 'mp4',
963 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
964 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
965 'duration': 721,
966 'upload_date': '20150127',
967 'uploader_id': 'BerkmanCenter',
968 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
969 'uploader': 'The Berkman Klein Center for Internet & Society',
970 'license': 'Creative Commons Attribution license (reuse allowed)',
971 },
972 'params': {
973 'skip_download': True,
974 },
975 },
976 {
977 # Channel-like uploader_url
978 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
979 'info_dict': {
980 'id': 'eQcmzGIKrzg',
981 'ext': 'mp4',
982 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
983 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
984 'duration': 4060,
985 'upload_date': '20151119',
986 'uploader': 'Bernie Sanders',
987 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
988 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
989 'license': 'Creative Commons Attribution license (reuse allowed)',
990 },
991 'params': {
992 'skip_download': True,
993 },
994 },
995 {
996 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
997 'only_matching': True,
998 },
999 {
1000 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1001 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1002 'only_matching': True,
1003 },
1004 {
1005 # Rental video preview
1006 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1007 'info_dict': {
1008 'id': 'uGpuVWrhIzE',
1009 'ext': 'mp4',
1010 'title': 'Piku - Trailer',
1011 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1012 'upload_date': '20150811',
1013 'uploader': 'FlixMatrix',
1014 'uploader_id': 'FlixMatrixKaravan',
1015 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1016 'license': 'Standard YouTube License',
1017 },
1018 'params': {
1019 'skip_download': True,
1020 },
1021 'skip': 'This video is not available.',
1022 },
1023 {
1024 # YouTube Red video with episode data
1025 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1026 'info_dict': {
1027 'id': 'iqKdEhx-dD4',
1028 'ext': 'mp4',
1029 'title': 'Isolation - Mind Field (Ep 1)',
1030 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1031 'duration': 2085,
1032 'upload_date': '20170118',
1033 'uploader': 'Vsauce',
1034 'uploader_id': 'Vsauce',
1035 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1036 'series': 'Mind Field',
1037 'season_number': 1,
1038 'episode_number': 1,
1039 },
1040 'params': {
1041 'skip_download': True,
1042 },
1043 'expected_warnings': [
1044 'Skipping DASH manifest',
1045 ],
1046 },
1047 {
1048 # The following content has been identified by the YouTube community
1049 # as inappropriate or offensive to some audiences.
1050 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1051 'info_dict': {
1052 'id': '6SJNVb0GnPI',
1053 'ext': 'mp4',
1054 'title': 'Race Differences in Intelligence',
1055 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1056 'duration': 965,
1057 'upload_date': '20140124',
1058 'uploader': 'New Century Foundation',
1059 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1060 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1061 },
1062 'params': {
1063 'skip_download': True,
1064 },
1065 },
1066 {
1067 # itag 212
1068 'url': '1t24XAntNCY',
1069 'only_matching': True,
1070 },
1071 {
1072 # geo restricted to JP
1073 'url': 'sJL6WA-aGkQ',
1074 'only_matching': True,
1075 },
1076 {
1077 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1078 'only_matching': True,
1079 },
1080 {
1081 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1082 'only_matching': True,
1083 },
1084 {
1085 # DRM protected
1086 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1087 'only_matching': True,
1088 },
1089 {
1090 # Video with unsupported adaptive stream type formats
1091 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1092 'info_dict': {
1093 'id': 'Z4Vy8R84T1U',
1094 'ext': 'mp4',
1095 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1096 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1097 'duration': 433,
1098 'upload_date': '20130923',
1099 'uploader': 'Amelia Putri Harwita',
1100 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1101 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1102 'formats': 'maxcount:10',
1103 },
1104 'params': {
1105 'skip_download': True,
1106 'youtube_include_dash_manifest': False,
1107 },
1108 'skip': 'not actual anymore',
1109 },
1110 {
1111 # Youtube Music Auto-generated description
1112 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1113 'info_dict': {
1114 'id': 'MgNrAu2pzNs',
1115 'ext': 'mp4',
1116 'title': 'Voyeur Girl',
1117 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1118 'upload_date': '20190312',
1119 'uploader': 'Stephen - Topic',
1120 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1121 'artist': 'Stephen',
1122 'track': 'Voyeur Girl',
1123 'album': 'it\'s too much love to know my dear',
1124 'release_date': '20190313',
1125 'release_year': 2019,
1126 },
1127 'params': {
1128 'skip_download': True,
1129 },
1130 },
1131 {
1132 # Youtube Music Auto-generated description
1133 # Retrieve 'artist' field from 'Artist:' in video description
1134 # when it is present on youtube music video
1135 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1136 'info_dict': {
1137 'id': 'k0jLE7tTwjY',
1138 'ext': 'mp4',
1139 'title': 'Latch Feat. Sam Smith',
1140 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1141 'upload_date': '20150110',
1142 'uploader': 'Various Artists - Topic',
1143 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1144 'artist': 'Disclosure',
1145 'track': 'Latch Feat. Sam Smith',
1146 'album': 'Latch Featuring Sam Smith',
1147 'release_date': '20121008',
1148 'release_year': 2012,
1149 },
1150 'params': {
1151 'skip_download': True,
1152 },
1153 },
1154 {
1155 # Youtube Music Auto-generated description
1156 # handle multiple artists on youtube music video
1157 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1158 'info_dict': {
1159 'id': '74qn0eJSjpA',
1160 'ext': 'mp4',
1161 'title': 'Eastside',
1162 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1163 'upload_date': '20180710',
1164 'uploader': 'Benny Blanco - Topic',
1165 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1166 'artist': 'benny blanco, Halsey, Khalid',
1167 'track': 'Eastside',
1168 'album': 'Eastside',
1169 'release_date': '20180713',
1170 'release_year': 2018,
1171 },
1172 'params': {
1173 'skip_download': True,
1174 },
1175 },
1176 {
1177 # Youtube Music Auto-generated description
1178 # handle youtube music video with release_year and no release_date
1179 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1180 'info_dict': {
1181 'id': '-hcAI0g-f5M',
1182 'ext': 'mp4',
1183 'title': 'Put It On Me',
1184 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1185 'upload_date': '20180426',
1186 'uploader': 'Matt Maeson - Topic',
1187 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1188 'artist': 'Matt Maeson',
1189 'track': 'Put It On Me',
1190 'album': 'The Hearse',
1191 'release_date': None,
1192 'release_year': 2018,
1193 },
1194 'params': {
1195 'skip_download': True,
1196 },
1197 },
1198 {
1199 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1200 'only_matching': True,
1201 },
1202 {
1203 # invalid -> valid video id redirection
1204 'url': 'DJztXj2GPfl',
1205 'info_dict': {
1206 'id': 'DJztXj2GPfk',
1207 'ext': 'mp4',
1208 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1209 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1210 'upload_date': '20090125',
1211 'uploader': 'Prochorowka',
1212 'uploader_id': 'Prochorowka',
1213 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1214 'artist': 'Panjabi MC',
1215 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1216 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1217 },
1218 'params': {
1219 'skip_download': True,
1220 },
1221 },
1222 {
1223 # empty description results in an empty string
1224 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1225 'info_dict': {
1226 'id': 'x41yOUIvK2k',
1227 'ext': 'mp4',
1228 'title': 'IMG 3456',
1229 'description': '',
1230 'upload_date': '20170613',
1231 'uploader_id': 'ElevageOrVert',
1232 'uploader': 'ElevageOrVert',
1233 },
1234 'params': {
1235 'skip_download': True,
1236 },
1237 },
1238 ]
1239
    def __init__(self, *args, **kwargs):
        """Initialize the extractor and its per-instance signature-function cache."""
        # Explicit class name keeps the super() call Python-2 compatible
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Maps (player_url, signature layout) -> decryption callable,
        # filled lazily by _decrypt_signature
        self._player_cache = {}
1243
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)
1247
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)
1251
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for video_id."""
        # NOTE: parameter name `format` shadows the builtin, but renaming it
        # would break keyword callers, so it is kept as-is
        self.to_screen('%s: Format %s not available' % (video_id, format))
1255
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
1259
1260 def _signature_cache_id(self, example_sig):
1261 """ Return a string representation of a signature """
1262 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1263
1264 @classmethod
1265 def _extract_player_info(cls, player_url):
1266 for player_re in cls._PLAYER_INFO_RE:
1267 id_m = re.search(player_re, player_url)
1268 if id_m:
1269 break
1270 else:
1271 raise ExtractorError('Cannot identify player %r' % player_url)
1272 return id_m.group('ext'), id_m.group('id')
1273
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from disk cache) the signature-decryption function.

        Returns a callable mapping an encrypted signature string to its
        decrypted form.  Results are cached keyed by player type, player id
        and the signature layout (dot-joined part lengths).
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes part of a cache filename; ensure it has no path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source-character indices; rebuild the
            # permutation function without downloading the player
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Derive the index permutation by running the function on a probe
        # string of distinct characters, then persist it for future runs.
        # NOTE(review): this assumes the function only permutes/selects
        # characters of its input — true for all known players.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1313
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Used by the youtube_print_sig_code option to help embed a newly
        discovered signature algorithm directly into the extractor.
        """
        def gen_sig_code(idxs):
            # Emit compact slice expressions for runs of consecutive indices
            # (step +1 or -1) and single 's[i]' terms for everything else.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # `end` is inclusive; Python slices are exclusive, hence end + step
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: either it continues, or flush the slice
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # A new consecutive run starts at prev
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the final open run.
            # NOTE(review): `i` is unbound when idxs has fewer than 2 elements;
            # signatures always have several characters, so this never triggers
            # in practice — confirm before reusing elsewhere.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1352
    def _parse_sig_js(self, jscode):
        """Locate the signature-decryption function in the JS player code.

        Returns a callable that maps an encrypted signature string to its
        decrypted form, evaluated through the bundled JS interpreter.
        """
        # Patterns are ordered from most recent player layouts to obsolete
        # ones; the first match wins.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The JS function takes its argument as a one-element argument list
        return lambda s: initial_function([s])
1373
1374 def _parse_sig_swf(self, file_contents):
1375 swfi = SWFInterpreter(file_contents)
1376 TARGET_CLASSNAME = 'SignatureDecipher'
1377 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1378 initial_function = swfi.extract_function(searched_class, 'decipher')
1379 return lambda s: initial_function([s])
1380
1381 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1382 """Turn the encrypted s field into a working signature"""
1383
1384 if player_url is None:
1385 raise ExtractorError('Cannot decrypt signature without player_url')
1386
1387 if player_url.startswith('//'):
1388 player_url = 'https:' + player_url
1389 elif not re.match(r'https?://', player_url):
1390 player_url = compat_urlparse.urljoin(
1391 'https://www.youtube.com', player_url)
1392 try:
1393 player_id = (player_url, self._signature_cache_id(s))
1394 if player_id not in self._player_cache:
1395 func = self._extract_signature_function(
1396 video_id, player_url, s
1397 )
1398 self._player_cache[player_id] = func
1399 func = self._player_cache[player_id]
1400 if self._downloader.params.get('youtube_print_sig_code'):
1401 self._print_sig_code(func, s)
1402 return func(s)
1403 except Exception as e:
1404 tb = traceback.format_exc()
1405 raise ExtractorError(
1406 'Signature extraction failed: ' + tb, cause=e)
1407
1408 def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
1409 try:
1410 subs_doc = self._download_xml(
1411 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1412 video_id, note=False)
1413 except ExtractorError as err:
1414 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1415 return {}
1416
1417 sub_lang_list = {}
1418 for track in subs_doc.findall('track'):
1419 lang = track.attrib['lang_code']
1420 if lang in sub_lang_list:
1421 continue
1422 sub_formats = []
1423 for ext in self._SUBTITLE_FORMATS:
1424 params = compat_urllib_parse_urlencode({
1425 'lang': lang,
1426 'v': video_id,
1427 'fmt': ext,
1428 'name': track.attrib['name'].encode('utf-8'),
1429 })
1430 sub_formats.append({
1431 'url': 'https://www.youtube.com/api/timedtext?' + params,
1432 'ext': ext,
1433 })
1434 sub_lang_list[lang] = sub_formats
1435 if has_live_chat_replay:
1436 sub_lang_list['live_chat'] = [
1437 {
1438 'video_id': video_id,
1439 'ext': 'json',
1440 'protocol': 'youtube_live_chat_replay',
1441 },
1442 ]
1443 if not sub_lang_list:
1444 self._downloader.report_warning('video doesn\'t have subtitles')
1445 return {}
1446 return sub_lang_list
1447
1448 def _get_ytplayer_config(self, video_id, webpage):
1449 patterns = (
1450 # User data may contain arbitrary character sequences that may affect
1451 # JSON extraction with regex, e.g. when '};' is contained the second
1452 # regex won't capture the whole JSON. Yet working around by trying more
1453 # concrete regex first keeping in mind proper quoted string handling
1454 # to be implemented in future that will replace this workaround (see
1455 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1456 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1457 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1458 r';ytplayer\.config\s*=\s*({.+?});',
1459 r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
1460 )
1461 config = self._search_regex(
1462 patterns, webpage, 'ytplayer.config', default=None)
1463 if config:
1464 return self._parse_json(
1465 uppercase_escape(config), video_id, fatal=False)
1466
1467 def _get_music_metadata_from_yt_initial(self, yt_initial):
1468 music_metadata = []
1469 key_map = {
1470 'Album': 'album',
1471 'Artist': 'artist',
1472 'Song': 'track'
1473 }
1474 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1475 if type(contents) is list:
1476 for content in contents:
1477 music_track = {}
1478 if type(content) is not dict:
1479 continue
1480 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1481 if type(videoSecondaryInfoRenderer) is not dict:
1482 continue
1483 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1484 if type(rows) is not list:
1485 continue
1486 for row in rows:
1487 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1488 if type(metadataRowRenderer) is not dict:
1489 continue
1490 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1491 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1492 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1493 if type(key) is not str or type(value) is not str:
1494 continue
1495 if key in key_map:
1496 if key_map[key] in music_track:
1497 # we've started on a new track
1498 music_metadata.append(music_track)
1499 music_track = {}
1500 music_track[key_map[key]] = value
1501 if len(music_track.keys()):
1502 music_metadata.append(music_track)
1503 return music_metadata
1504
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns automatic (ASR) caption tracks as {lang: [format dicts]}.
        Three historical mechanisms exist: the legacy args.ttsurl API, the
        player_response captions renderer, and an obsolete caption_tracks
        path (now unreachable, see note below)."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            # Legacy (flash-era) mechanism: args.ttsurl points directly at
            # the timedtext service
            if "args" in player_config and "ttsurl" in player_config["args"]:
                args = player_config['args']
                caption_url = args['ttsurl']
                timestamp = args['timestamp']

                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One entry per translation target; each track is offered in
                # every supported subtitle format
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build {lang: [format dicts]} by rewriting the base caption
                # URL's query string per language/format combination
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if "args" in player_config:
                player_response = player_config["args"].get('player_response')
            else:
                # New player system (ytInitialPlayerResponse) as of October 2020
                player_response = player_config

            if player_response:
                if isinstance(player_response, compat_str):
                    player_response = self._parse_json(
                        player_response, video_id, fatal=False)

                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                caption_tracks = renderer['captionTracks']
                for caption_track in caption_tracks:
                    if 'kind' not in caption_track:
                        # not an automatic transcription
                        continue
                    base_url = caption_track['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
            return {}

            # NOTE(review): the code below is unreachable — every path above
            # either returns or raises before this point.  Kept only as
            # documentation of the obsolete caption_tracks mechanism.
            if "args" in player_config:
                args = player_config["args"]

                # Some videos don't provide ttsurl but rather caption_tracks and
                # caption_translation_languages (e.g. 20LmZk1hakA)
                # Does not used anymore as of 22.06.2017
                caption_tracks = args['caption_tracks']
                caption_translation_languages = args['caption_translation_languages']
                caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
                sub_lang_list = []
                for lang in caption_translation_languages.split(','):
                    lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                    sub_lang = lang_qs.get('lc', [None])[0]
                    if sub_lang:
                        sub_lang_list.append(sub_lang)
                return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1624
1625 def _mark_watched(self, video_id, video_info, player_response):
1626 playback_url = url_or_none(try_get(
1627 player_response,
1628 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1629 video_info, lambda x: x['videostats_playback_base_url'][0]))
1630 if not playback_url:
1631 return
1632 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1633 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1634
1635 # cpn generation algorithm is reverse engineered from base.js.
1636 # In fact it works even with dummy cpn.
1637 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1638 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1639
1640 qs.update({
1641 'ver': ['2'],
1642 'cpn': [cpn],
1643 })
1644 playback_url = compat_urlparse.urlunparse(
1645 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1646
1647 self._download_webpage(
1648 playback_url, video_id, 'Marking watched',
1649 'Unable to mark watched', fatal=False)
1650
1651 @staticmethod
1652 def _extract_urls(webpage):
1653 # Embedded YouTube player
1654 entries = [
1655 unescapeHTML(mobj.group('url'))
1656 for mobj in re.finditer(r'''(?x)
1657 (?:
1658 <iframe[^>]+?src=|
1659 data-video-url=|
1660 <embed[^>]+?src=|
1661 embedSWF\(?:\s*|
1662 <object[^>]+data=|
1663 new\s+SWFObject\(
1664 )
1665 (["\'])
1666 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1667 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1668 \1''', webpage)]
1669
1670 # lazyYT YouTube embed
1671 entries.extend(list(map(
1672 unescapeHTML,
1673 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1674
1675 # Wordpress "YouTube Video Importer" plugin
1676 matches = re.findall(r'''(?x)<div[^>]+
1677 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1678 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1679 entries.extend(m[-1] for m in matches)
1680
1681 return entries
1682
1683 @staticmethod
1684 def _extract_url(webpage):
1685 urls = YoutubeIE._extract_urls(webpage)
1686 return urls[0] if urls else None
1687
1688 @classmethod
1689 def extract_id(cls, url):
1690 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1691 if mobj is None:
1692 raise ExtractorError('Invalid URL: %s' % url)
1693 video_id = mobj.group(2)
1694 return video_id
1695
1696 def _extract_chapters_from_json(self, webpage, video_id, duration):
1697 if not webpage:
1698 return
1699 initial_data = self._parse_json(
1700 self._search_regex(
1701 r'window\["ytInitialData"\] = (.+);\n', webpage,
1702 'player args', default='{}'),
1703 video_id, fatal=False)
1704 if not initial_data or not isinstance(initial_data, dict):
1705 return
1706 chapters_list = try_get(
1707 initial_data,
1708 lambda x: x['playerOverlays']
1709 ['playerOverlayRenderer']
1710 ['decoratedPlayerBarRenderer']
1711 ['decoratedPlayerBarRenderer']
1712 ['playerBar']
1713 ['chapteredPlayerBarRenderer']
1714 ['chapters'],
1715 list)
1716 if not chapters_list:
1717 return
1718
1719 def chapter_time(chapter):
1720 return float_or_none(
1721 try_get(
1722 chapter,
1723 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1724 int),
1725 scale=1000)
1726 chapters = []
1727 for next_num, chapter in enumerate(chapters_list, start=1):
1728 start_time = chapter_time(chapter)
1729 if start_time is None:
1730 continue
1731 end_time = (chapter_time(chapters_list[next_num])
1732 if next_num < len(chapters_list) else duration)
1733 if end_time is None:
1734 continue
1735 title = try_get(
1736 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1737 compat_str)
1738 chapters.append({
1739 'start_time': start_time,
1740 'end_time': end_time,
1741 'title': title,
1742 })
1743 return chapters
1744
1745 @staticmethod
1746 def _extract_chapters_from_description(description, duration):
1747 if not description:
1748 return None
1749 chapter_lines = re.findall(
1750 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1751 description)
1752 if not chapter_lines:
1753 return None
1754 chapters = []
1755 for next_num, (chapter_line, time_point) in enumerate(
1756 chapter_lines, start=1):
1757 start_time = parse_duration(time_point)
1758 if start_time is None:
1759 continue
1760 if start_time > duration:
1761 break
1762 end_time = (duration if next_num == len(chapter_lines)
1763 else parse_duration(chapter_lines[next_num][1]))
1764 if end_time is None:
1765 continue
1766 if end_time > duration:
1767 end_time = duration
1768 if start_time > end_time:
1769 break
1770 chapter_title = re.sub(
1771 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1772 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1773 chapters.append({
1774 'start_time': start_time,
1775 'end_time': end_time,
1776 'title': chapter_title,
1777 })
1778 return chapters
1779
1780 def _extract_chapters(self, webpage, description, video_id, duration):
1781 return (self._extract_chapters_from_json(webpage, video_id, duration)
1782 or self._extract_chapters_from_description(description, duration))
1783
1784 def _real_extract(self, url):
1785 url, smuggled_data = unsmuggle_url(url, {})
1786
1787 proto = (
1788 'http' if self._downloader.params.get('prefer_insecure', False)
1789 else 'https')
1790
1791 start_time = None
1792 end_time = None
1793 parsed_url = compat_urllib_parse_urlparse(url)
1794 for component in [parsed_url.fragment, parsed_url.query]:
1795 query = compat_parse_qs(component)
1796 if start_time is None and 't' in query:
1797 start_time = parse_duration(query['t'][0])
1798 if start_time is None and 'start' in query:
1799 start_time = parse_duration(query['start'][0])
1800 if end_time is None and 'end' in query:
1801 end_time = parse_duration(query['end'][0])
1802
1803 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1804 mobj = re.search(self._NEXT_URL_RE, url)
1805 if mobj:
1806 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1807 video_id = self.extract_id(url)
1808
1809 # Get video webpage
1810 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1811 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1812
1813 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1814 video_id = qs.get('v', [None])[0] or video_id
1815
1816 # Attempt to extract SWF player URL
1817 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1818 if mobj is not None:
1819 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1820 else:
1821 player_url = None
1822
1823 dash_mpds = []
1824
1825 def add_dash_mpd(video_info):
1826 dash_mpd = video_info.get('dashmpd')
1827 if dash_mpd and dash_mpd[0] not in dash_mpds:
1828 dash_mpds.append(dash_mpd[0])
1829
1830 def add_dash_mpd_pr(pl_response):
1831 dash_mpd = url_or_none(try_get(
1832 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1833 compat_str))
1834 if dash_mpd and dash_mpd not in dash_mpds:
1835 dash_mpds.append(dash_mpd)
1836
1837 is_live = None
1838 view_count = None
1839
1840 def extract_view_count(v_info):
1841 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1842
1843 def extract_player_response(player_response, video_id):
1844 pl_response = str_or_none(player_response)
1845 if not pl_response:
1846 return
1847 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1848 if isinstance(pl_response, dict):
1849 add_dash_mpd_pr(pl_response)
1850 return pl_response
1851
1852 def extract_embedded_config(embed_webpage, video_id):
1853 embedded_config = self._search_regex(
1854 r'setConfig\(({.*})\);',
1855 embed_webpage, 'ytInitialData', default=None)
1856 if embedded_config:
1857 return embedded_config
1858
1859 player_response = {}
1860
1861 # Get video info
1862 video_info = {}
1863 embed_webpage = None
1864 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1865 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1866 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1867 age_gate = True
1868 # We simulate the access to the video from www.youtube.com/v/{video_id}
1869 # this can be viewed without login into Youtube
1870 url = proto + '://www.youtube.com/embed/%s' % video_id
1871 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1872 ext = extract_embedded_config(embed_webpage, video_id)
1873 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1874 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1875 if not playable_in_embed:
1876 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1877 playable_in_embed = ''
1878 else:
1879 playable_in_embed = playable_in_embed.group('playableinEmbed')
1880 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1881 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1882 if playable_in_embed == 'false':
1883 '''
1884 # TODO apply this patch when Support for Python 2.6(!) and above drops
1885 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1886 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1887 '''
1888 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1889 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1890 age_gate = False
1891 # Try looking directly into the video webpage
1892 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1893 if ytplayer_config:
1894 args = ytplayer_config.get("args")
1895 if args is not None:
1896 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1897 # Convert to the same format returned by compat_parse_qs
1898 video_info = dict((k, [v]) for k, v in args.items())
1899 add_dash_mpd(video_info)
1900 # Rental video is not rented but preview is available (e.g.
1901 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1902 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1903 if not video_info and args.get('ypc_vid'):
1904 return self.url_result(
1905 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1906 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1907 is_live = True
1908 if not player_response:
1909 player_response = extract_player_response(args.get('player_response'), video_id)
1910 elif not player_response:
1911 player_response = ytplayer_config
1912 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1913 add_dash_mpd_pr(player_response)
1914 else:
1915 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1916 else:
1917 data = compat_urllib_parse_urlencode({
1918 'video_id': video_id,
1919 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1920 'sts': self._search_regex(
1921 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1922 })
1923 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1924 try:
1925 video_info_webpage = self._download_webpage(
1926 video_info_url, video_id,
1927 note='Refetching age-gated info webpage',
1928 errnote='unable to download video info webpage')
1929 except ExtractorError:
1930 video_info_webpage = None
1931 if video_info_webpage:
1932 video_info = compat_parse_qs(video_info_webpage)
1933 pl_response = video_info.get('player_response', [None])[0]
1934 player_response = extract_player_response(pl_response, video_id)
1935 add_dash_mpd(video_info)
1936 view_count = extract_view_count(video_info)
1937 else:
1938 age_gate = False
1939 # Try looking directly into the video webpage
1940 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1941 args = ytplayer_config.get("args")
1942 if args is not None:
1943 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1944 # Convert to the same format returned by compat_parse_qs
1945 video_info = dict((k, [v]) for k, v in args.items())
1946 add_dash_mpd(video_info)
1947 # Rental video is not rented but preview is available (e.g.
1948 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1949 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1950 if not video_info and args.get('ypc_vid'):
1951 return self.url_result(
1952 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1953 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1954 is_live = True
1955 if not player_response:
1956 player_response = extract_player_response(args.get('player_response'), video_id)
1957 elif not player_response:
1958 player_response = ytplayer_config
1959 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1960 add_dash_mpd_pr(player_response)
1961
1962 def extract_unavailable_message():
1963 messages = []
1964 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1965 msg = self._html_search_regex(
1966 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1967 video_webpage, 'unavailable %s' % kind, default=None)
1968 if msg:
1969 messages.append(msg)
1970 if messages:
1971 return '\n'.join(messages)
1972
1973 if not video_info and not player_response:
1974 unavailable_message = extract_unavailable_message()
1975 if not unavailable_message:
1976 unavailable_message = 'Unable to extract video data'
1977 raise ExtractorError(
1978 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1979
1980 if not isinstance(video_info, dict):
1981 video_info = {}
1982
1983 video_details = try_get(
1984 player_response, lambda x: x['videoDetails'], dict) or {}
1985
1986 microformat = try_get(
1987 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1988
1989 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1990 if not video_title:
1991 self._downloader.report_warning('Unable to extract video title')
1992 video_title = '_'
1993
1994 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1995 if video_description:
1996
1997 def replace_url(m):
1998 redir_url = compat_urlparse.urljoin(url, m.group(1))
1999 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
2000 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
2001 qs = compat_parse_qs(parsed_redir_url.query)
2002 q = qs.get('q')
2003 if q and q[0]:
2004 return q[0]
2005 return redir_url
2006
2007 description_original = video_description = re.sub(r'''(?x)
2008 <a\s+
2009 (?:[a-zA-Z-]+="[^"]*"\s+)*?
2010 (?:title|href)="([^"]+)"\s+
2011 (?:[a-zA-Z-]+="[^"]*"\s+)*?
2012 class="[^"]*"[^>]*>
2013 [^<]+\.{3}\s*
2014 </a>
2015 ''', replace_url, video_description)
2016 video_description = clean_html(video_description)
2017 else:
2018 video_description = video_details.get('shortDescription')
2019 if video_description is None:
2020 video_description = self._html_search_meta('description', video_webpage)
2021
2022 if not smuggled_data.get('force_singlefeed', False):
2023 if not self._downloader.params.get('noplaylist'):
2024 multifeed_metadata_list = try_get(
2025 player_response,
2026 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2027 compat_str) or try_get(
2028 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2029 if multifeed_metadata_list:
2030 entries = []
2031 feed_ids = []
2032 for feed in multifeed_metadata_list.split(','):
2033 # Unquote should take place before split on comma (,) since textual
2034 # fields may contain comma as well (see
2035 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2036 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
2037
2038 def feed_entry(name):
2039 return try_get(feed_data, lambda x: x[name][0], compat_str)
2040
2041 feed_id = feed_entry('id')
2042 if not feed_id:
2043 continue
2044 feed_title = feed_entry('title')
2045 title = video_title
2046 if feed_title:
2047 title += ' (%s)' % feed_title
2048 entries.append({
2049 '_type': 'url_transparent',
2050 'ie_key': 'Youtube',
2051 'url': smuggle_url(
2052 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2053 {'force_singlefeed': True}),
2054 'title': title,
2055 })
2056 feed_ids.append(feed_id)
2057 self.to_screen(
2058 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2059 % (', '.join(feed_ids), video_id))
2060 return self.playlist_result(entries, video_id, video_title, video_description)
2061 else:
2062 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2063
2064 if view_count is None:
2065 view_count = extract_view_count(video_info)
2066 if view_count is None and video_details:
2067 view_count = int_or_none(video_details.get('viewCount'))
2068 if view_count is None and microformat:
2069 view_count = int_or_none(microformat.get('viewCount'))
2070
2071 if is_live is None:
2072 is_live = bool_or_none(video_details.get('isLive'))
2073
2074 has_live_chat_replay = False
2075 if not is_live:
2076 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2077 try:
2078 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2079 has_live_chat_replay = True
2080 except (KeyError, IndexError, TypeError):
2081 pass
2082
2083 # Check for "rental" videos
2084 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2085 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2086
2087 def _extract_filesize(media_url):
2088 return int_or_none(self._search_regex(
2089 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2090
2091 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2092 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2093
2094 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2095 self.report_rtmp_download()
2096 formats = [{
2097 'format_id': '_rtmp',
2098 'protocol': 'rtmp',
2099 'url': video_info['conn'][0],
2100 'player_url': player_url,
2101 }]
2102 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2103 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2104 if 'rtmpe%3Dyes' in encoded_url_map:
2105 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2106 formats = []
2107 formats_spec = {}
2108 fmt_list = video_info.get('fmt_list', [''])[0]
2109 if fmt_list:
2110 for fmt in fmt_list.split(','):
2111 spec = fmt.split('/')
2112 if len(spec) > 1:
2113 width_height = spec[1].split('x')
2114 if len(width_height) == 2:
2115 formats_spec[spec[0]] = {
2116 'resolution': spec[1],
2117 'width': int_or_none(width_height[0]),
2118 'height': int_or_none(width_height[1]),
2119 }
2120 for fmt in streaming_formats:
2121 itag = str_or_none(fmt.get('itag'))
2122 if not itag:
2123 continue
2124 quality = fmt.get('quality')
2125 quality_label = fmt.get('qualityLabel') or quality
2126 formats_spec[itag] = {
2127 'asr': int_or_none(fmt.get('audioSampleRate')),
2128 'filesize': int_or_none(fmt.get('contentLength')),
2129 'format_note': quality_label,
2130 'fps': int_or_none(fmt.get('fps')),
2131 'height': int_or_none(fmt.get('height')),
2132 # bitrate for itag 43 is always 2147483647
2133 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2134 'width': int_or_none(fmt.get('width')),
2135 }
2136
2137 for fmt in streaming_formats:
2138 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2139 continue
2140 url = url_or_none(fmt.get('url'))
2141
2142 if not url:
2143 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2144 if not cipher:
2145 continue
2146 url_data = compat_parse_qs(cipher)
2147 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2148 if not url:
2149 continue
2150 else:
2151 cipher = None
2152 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2153
2154 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2155 # Unsupported FORMAT_STREAM_TYPE_OTF
2156 if stream_type == 3:
2157 continue
2158
2159 format_id = fmt.get('itag') or url_data['itag'][0]
2160 if not format_id:
2161 continue
2162 format_id = compat_str(format_id)
2163
2164 if cipher:
2165 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2166 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
2167 jsplayer_url_json = self._search_regex(
2168 ASSETS_RE,
2169 embed_webpage if age_gate else video_webpage,
2170 'JS player URL (1)', default=None)
2171 if not jsplayer_url_json and not age_gate:
2172 # We need the embed website after all
2173 if embed_webpage is None:
2174 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2175 embed_webpage = self._download_webpage(
2176 embed_url, video_id, 'Downloading embed webpage')
2177 jsplayer_url_json = self._search_regex(
2178 ASSETS_RE, embed_webpage, 'JS player URL')
2179
2180 player_url = json.loads(jsplayer_url_json)
2181 if player_url is None:
2182 player_url_json = self._search_regex(
2183 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2184 video_webpage, 'age gate player URL')
2185 player_url = json.loads(player_url_json)
2186
2187 if 'sig' in url_data:
2188 url += '&signature=' + url_data['sig'][0]
2189 elif 's' in url_data:
2190 encrypted_sig = url_data['s'][0]
2191
2192 if self._downloader.params.get('verbose'):
2193 if player_url is None:
2194 player_desc = 'unknown'
2195 else:
2196 player_type, player_version = self._extract_player_info(player_url)
2197 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2198 parts_sizes = self._signature_cache_id(encrypted_sig)
2199 self.to_screen('{%s} signature length %s, %s' %
2200 (format_id, parts_sizes, player_desc))
2201
2202 signature = self._decrypt_signature(
2203 encrypted_sig, video_id, player_url, age_gate)
2204 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2205 url += '&%s=%s' % (sp, signature)
2206 if 'ratebypass' not in url:
2207 url += '&ratebypass=yes'
2208
2209 dct = {
2210 'format_id': format_id,
2211 'url': url,
2212 'player_url': player_url,
2213 }
2214 if format_id in self._formats:
2215 dct.update(self._formats[format_id])
2216 if format_id in formats_spec:
2217 dct.update(formats_spec[format_id])
2218
2219 # Some itags are not included in DASH manifest thus corresponding formats will
2220 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2221 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2222 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2223 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2224
2225 if width is None:
2226 width = int_or_none(fmt.get('width'))
2227 if height is None:
2228 height = int_or_none(fmt.get('height'))
2229
2230 filesize = int_or_none(url_data.get(
2231 'clen', [None])[0]) or _extract_filesize(url)
2232
2233 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2234 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2235
2236 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2237 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2238 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2239
2240 more_fields = {
2241 'filesize': filesize,
2242 'tbr': tbr,
2243 'width': width,
2244 'height': height,
2245 'fps': fps,
2246 'format_note': quality_label or quality,
2247 }
2248 for key, value in more_fields.items():
2249 if value:
2250 dct[key] = value
2251 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2252 if type_:
2253 type_split = type_.split(';')
2254 kind_ext = type_split[0].split('/')
2255 if len(kind_ext) == 2:
2256 kind, _ = kind_ext
2257 dct['ext'] = mimetype2ext(type_split[0])
2258 if kind in ('audio', 'video'):
2259 codecs = None
2260 for mobj in re.finditer(
2261 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2262 if mobj.group('key') == 'codecs':
2263 codecs = mobj.group('val')
2264 break
2265 if codecs:
2266 dct.update(parse_codecs(codecs))
2267 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2268 dct['downloader_options'] = {
2269 # Youtube throttles chunks >~10M
2270 'http_chunk_size': 10485760,
2271 }
2272 formats.append(dct)
2273 else:
2274 manifest_url = (
2275 url_or_none(try_get(
2276 player_response,
2277 lambda x: x['streamingData']['hlsManifestUrl'],
2278 compat_str))
2279 or url_or_none(try_get(
2280 video_info, lambda x: x['hlsvp'][0], compat_str)))
2281 if manifest_url:
2282 formats = []
2283 m3u8_formats = self._extract_m3u8_formats(
2284 manifest_url, video_id, 'mp4', fatal=False)
2285 for a_format in m3u8_formats:
2286 itag = self._search_regex(
2287 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2288 if itag:
2289 a_format['format_id'] = itag
2290 if itag in self._formats:
2291 dct = self._formats[itag].copy()
2292 dct.update(a_format)
2293 a_format = dct
2294 a_format['player_url'] = player_url
2295 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2296 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2297 if self._downloader.params.get('youtube_include_hls_manifest', True):
2298 formats.append(a_format)
2299 else:
2300 error_message = extract_unavailable_message()
2301 if not error_message:
2302 error_message = clean_html(try_get(
2303 player_response, lambda x: x['playabilityStatus']['reason'],
2304 compat_str))
2305 if not error_message:
2306 error_message = clean_html(
2307 try_get(video_info, lambda x: x['reason'][0], compat_str))
2308 if error_message:
2309 raise ExtractorError(error_message, expected=True)
2310 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2311
2312 # uploader
2313 video_uploader = try_get(
2314 video_info, lambda x: x['author'][0],
2315 compat_str) or str_or_none(video_details.get('author'))
2316 if video_uploader:
2317 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2318 else:
2319 self._downloader.report_warning('unable to extract uploader name')
2320
2321 # uploader_id
2322 video_uploader_id = None
2323 video_uploader_url = None
2324 mobj = re.search(
2325 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2326 video_webpage)
2327 if mobj is not None:
2328 video_uploader_id = mobj.group('uploader_id')
2329 video_uploader_url = mobj.group('uploader_url')
2330 else:
2331 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2332 if owner_profile_url:
2333 video_uploader_id = self._search_regex(
2334 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2335 default=None)
2336 video_uploader_url = owner_profile_url
2337
2338 channel_id = (
2339 str_or_none(video_details.get('channelId'))
2340 or self._html_search_meta(
2341 'channelId', video_webpage, 'channel id', default=None)
2342 or self._search_regex(
2343 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2344 video_webpage, 'channel id', default=None, group='id'))
2345 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2346
2347 thumbnails = []
2348 thumbnails_list = try_get(
2349 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2350 for t in thumbnails_list:
2351 if not isinstance(t, dict):
2352 continue
2353 thumbnail_url = url_or_none(t.get('url'))
2354 if not thumbnail_url:
2355 continue
2356 thumbnails.append({
2357 'url': thumbnail_url,
2358 'width': int_or_none(t.get('width')),
2359 'height': int_or_none(t.get('height')),
2360 })
2361
2362 if not thumbnails:
2363 video_thumbnail = None
2364 # We try first to get a high quality image:
2365 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2366 video_webpage, re.DOTALL)
2367 if m_thumb is not None:
2368 video_thumbnail = m_thumb.group(1)
2369 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2370 if thumbnail_url:
2371 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2372 if video_thumbnail:
2373 thumbnails.append({'url': video_thumbnail})
2374
2375 # upload date
2376 upload_date = self._html_search_meta(
2377 'datePublished', video_webpage, 'upload date', default=None)
2378 if not upload_date:
2379 upload_date = self._search_regex(
2380 [r'(?s)id="eow-date.*?>(.*?)</span>',
2381 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2382 video_webpage, 'upload date', default=None)
2383 if not upload_date:
2384 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2385 upload_date = unified_strdate(upload_date)
2386
2387 video_license = self._html_search_regex(
2388 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2389 video_webpage, 'license', default=None)
2390
2391 m_music = re.search(
2392 r'''(?x)
2393 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2394 <ul[^>]*>\s*
2395 <li>(?P<title>.+?)
2396 by (?P<creator>.+?)
2397 (?:
2398 \(.+?\)|
2399 <a[^>]*
2400 (?:
2401 \bhref=["\']/red[^>]*>| # drop possible
2402 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2403 )
2404 .*?
2405 )?</li
2406 ''',
2407 video_webpage)
2408 if m_music:
2409 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2410 video_creator = clean_html(m_music.group('creator'))
2411 else:
2412 video_alt_title = video_creator = None
2413
2414 def extract_meta(field):
2415 return self._html_search_regex(
2416 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2417 video_webpage, field, default=None)
2418
2419 track = extract_meta('Song')
2420 artist = extract_meta('Artist')
2421 album = extract_meta('Album')
2422
2423 # Youtube Music Auto-generated description
2424 release_date = release_year = None
2425 if video_description:
2426 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2427 if mobj:
2428 if not track:
2429 track = mobj.group('track').strip()
2430 if not artist:
2431 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2432 if not album:
2433 album = mobj.group('album'.strip())
2434 release_year = mobj.group('release_year')
2435 release_date = mobj.group('release_date')
2436 if release_date:
2437 release_date = release_date.replace('-', '')
2438 if not release_year:
2439 release_year = int(release_date[:4])
2440 if release_year:
2441 release_year = int(release_year)
2442
2443 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2444 if yt_initial:
2445 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2446 if len(music_metadata):
2447 album = music_metadata[0].get('album')
2448 artist = music_metadata[0].get('artist')
2449 track = music_metadata[0].get('track')
2450
2451 m_episode = re.search(
2452 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2453 video_webpage)
2454 if m_episode:
2455 series = unescapeHTML(m_episode.group('series'))
2456 season_number = int(m_episode.group('season'))
2457 episode_number = int(m_episode.group('episode'))
2458 else:
2459 series = season_number = episode_number = None
2460
2461 m_cat_container = self._search_regex(
2462 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2463 video_webpage, 'categories', default=None)
2464 category = None
2465 if m_cat_container:
2466 category = self._html_search_regex(
2467 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2468 default=None)
2469 if not category:
2470 category = try_get(
2471 microformat, lambda x: x['category'], compat_str)
2472 video_categories = None if category is None else [category]
2473
2474 video_tags = [
2475 unescapeHTML(m.group('content'))
2476 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2477 if not video_tags:
2478 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2479
2480 def _extract_count(count_name):
2481 return str_to_int(self._search_regex(
2482 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2483 % re.escape(count_name),
2484 video_webpage, count_name, default=None))
2485
2486 like_count = _extract_count('like')
2487 dislike_count = _extract_count('dislike')
2488
2489 if view_count is None:
2490 view_count = str_to_int(self._search_regex(
2491 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2492 'view count', default=None))
2493
2494 average_rating = (
2495 float_or_none(video_details.get('averageRating'))
2496 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2497
2498 # subtitles
2499 video_subtitles = self.extract_subtitles(
2500 video_id, video_webpage, has_live_chat_replay)
2501 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2502
2503 video_duration = try_get(
2504 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2505 if not video_duration:
2506 video_duration = int_or_none(video_details.get('lengthSeconds'))
2507 if not video_duration:
2508 video_duration = parse_duration(self._html_search_meta(
2509 'duration', video_webpage, 'video duration'))
2510
2511 # Get Subscriber Count of channel
2512 subscriber_count = parse_count(self._search_regex(
2513 r'"text":"([\d\.]+\w?) subscribers"',
2514 video_webpage,
2515 'subscriber count',
2516 default=None
2517 ))
2518
2519 # annotations
2520 video_annotations = None
2521 if self._downloader.params.get('writeannotations', False):
2522 xsrf_token = self._search_regex(
2523 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2524 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2525 invideo_url = try_get(
2526 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2527 if xsrf_token and invideo_url:
2528 xsrf_field_name = self._search_regex(
2529 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2530 video_webpage, 'xsrf field name',
2531 group='xsrf_field_name', default='session_token')
2532 video_annotations = self._download_webpage(
2533 self._proto_relative_url(invideo_url),
2534 video_id, note='Downloading annotations',
2535 errnote='Unable to download video annotations', fatal=False,
2536 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2537
2538 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2539
2540 # Look for the DASH manifest
2541 if self._downloader.params.get('youtube_include_dash_manifest', True):
2542 dash_mpd_fatal = True
2543 for mpd_url in dash_mpds:
2544 dash_formats = {}
2545 try:
2546 def decrypt_sig(mobj):
2547 s = mobj.group(1)
2548 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2549 return '/signature/%s' % dec_s
2550
2551 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2552
2553 for df in self._extract_mpd_formats(
2554 mpd_url, video_id, fatal=dash_mpd_fatal,
2555 formats_dict=self._formats):
2556 if not df.get('filesize'):
2557 df['filesize'] = _extract_filesize(df['url'])
2558 # Do not overwrite DASH format found in some previous DASH manifest
2559 if df['format_id'] not in dash_formats:
2560 dash_formats[df['format_id']] = df
2561 # Additional DASH manifests may end up in HTTP Error 403 therefore
2562 # allow them to fail without bug report message if we already have
2563 # some DASH manifest succeeded. This is temporary workaround to reduce
2564 # burst of bug reports until we figure out the reason and whether it
2565 # can be fixed at all.
2566 dash_mpd_fatal = False
2567 except (ExtractorError, KeyError) as e:
2568 self.report_warning(
2569 'Skipping DASH manifest: %r' % e, video_id)
2570 if dash_formats:
2571 # Remove the formats we found through non-DASH, they
2572 # contain less info and it can be wrong, because we use
2573 # fixed values (for example the resolution). See
2574 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2575 # example.
2576 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2577 formats.extend(dash_formats.values())
2578
2579 # Check for malformed aspect ratio
2580 stretched_m = re.search(
2581 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2582 video_webpage)
2583 if stretched_m:
2584 w = float(stretched_m.group('w'))
2585 h = float(stretched_m.group('h'))
2586 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2587 # We will only process correct ratios.
2588 if w > 0 and h > 0:
2589 ratio = w / h
2590 for f in formats:
2591 if f.get('vcodec') != 'none':
2592 f['stretched_ratio'] = ratio
2593
2594 if not formats:
2595 if 'reason' in video_info:
2596 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2597 regions_allowed = self._html_search_meta(
2598 'regionsAllowed', video_webpage, default=None)
2599 countries = regions_allowed.split(',') if regions_allowed else None
2600 self.raise_geo_restricted(
2601 msg=video_info['reason'][0], countries=countries)
2602 reason = video_info['reason'][0]
2603 if 'Invalid parameters' in reason:
2604 unavailable_message = extract_unavailable_message()
2605 if unavailable_message:
2606 reason = unavailable_message
2607 raise ExtractorError(
2608 'YouTube said: %s' % reason,
2609 expected=True, video_id=video_id)
2610 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2611 raise ExtractorError('This video is DRM protected.', expected=True)
2612
2613 self._sort_formats(formats)
2614
2615 self.mark_watched(video_id, video_info, player_response)
2616
2617 return {
2618 'id': video_id,
2619 'uploader': video_uploader,
2620 'uploader_id': video_uploader_id,
2621 'uploader_url': video_uploader_url,
2622 'channel_id': channel_id,
2623 'channel_url': channel_url,
2624 'upload_date': upload_date,
2625 'license': video_license,
2626 'creator': video_creator or artist,
2627 'title': video_title,
2628 'alt_title': video_alt_title or track,
2629 'thumbnails': thumbnails,
2630 'description': video_description,
2631 'categories': video_categories,
2632 'tags': video_tags,
2633 'subtitles': video_subtitles,
2634 'automatic_captions': automatic_captions,
2635 'duration': video_duration,
2636 'age_limit': 18 if age_gate else 0,
2637 'annotations': video_annotations,
2638 'chapters': chapters,
2639 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2640 'view_count': view_count,
2641 'like_count': like_count,
2642 'dislike_count': dislike_count,
2643 'average_rating': average_rating,
2644 'formats': formats,
2645 'is_live': is_live,
2646 'start_time': start_time,
2647 'end_time': end_time,
2648 'series': series,
2649 'season_number': season_number,
2650 'episode_number': episode_number,
2651 'track': track,
2652 'artist': artist,
2653 'album': album,
2654 'release_date': release_date,
2655 'release_year': release_year,
2656 'subscriber_count': subscriber_count,
2657 }
2658
2659
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    # Verbose pattern with two top-level alternatives:
    #  * a recognised youtube/youtubekids/invidious/youtu.be playlist-style URL
    #    (playlist id captured in group 1), or
    #  * a bare playlist id on its own (captured in group 2).
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    # Playlist ids with this prefix belong to Youtube Music; they are extracted
    # as regular playlists but get the fixed uploader info below.
    _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
    _YTM_CHANNEL_INFO = {
        'uploader': 'Youtube Music',
        'uploader_id': 'music',  # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
        'uploader_url': 'https://www.youtube.com/music'
    }
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Log in (if credentials are configured) before any extraction."""
        self._login()

    def extract_videos_from_page(self, page):
        """Collect (video_id, title) pairs from a playlist webpage.

        Tries the modern data-video-id markup first, then falls back to
        progressively more relaxed href-based regexes.  Returns an iterator
        of (id, title) tuples; titles may be None for the relaxed fallbacks.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix_ids_from_yt_initial(self, yt_initial):
        """Pull video ids out of the embedded ytInitialData JSON of a mix page."""
        ids = []
        playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list)
        if playlist_contents:
            for item in playlist_contents:
                videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
                if videoId:
                    ids.append(videoId)
        return ids

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix (id 'RD' + seed video id).

        Mixes have no dedicated playlist page; the video list is scraped from
        successive watch pages, re-seeding each request with the last id seen,
        until no new ids appear.
        """
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        yt_initial = None
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))

            # if no ids in html of page, try using embedded json
            if (len(new_ids) == 0):
                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
                if yt_initial:
                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Try several title containers in decreasing order of specificity.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        if not title:
            title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist page.

        Returns (has_videos, playlist_result); has_videos is False when the
        URL matched a playlist but the page yields no entries.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })
        if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
            playlist.update(self._YTM_CHANNEL_INFO)

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Return (video_id, result) for video-specific URLs.

        result is a single-video url_result when --no-playlist is set,
        otherwise None; (None, None) when the URL carries no video id.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
                # Mixes require a custom extraction process,
                # Youtube Music playlists act like normal playlists (with randomized order)
                return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
3052
3053
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        """Build the channel videos page URL (overridden by YoutubeUserIE)."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC...' channel id maps to the 'UU...' uploads playlist
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # Empty channel page: surface any alert message Youtube shows
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3153
3154
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Match only when no other Youtube extractor claims the URL.

        _VALID_URL here is very permissive, so every other Youtube*IE in the
        module gets the first chance to handle the URL.
        """
        for name, klass in globals().items():
            if klass is cls or not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Build the videos page URL, keeping the original /user/ vs /c/ prefix."""
        mobj = re.match(self._VALID_URL, url)
        prefix = mobj.group('user') or 'user'
        return self._TEMPLATE_URL % (prefix, mobj.group('id'))
3212
3213
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to the current live video, else to the channel."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            # Page unavailable: hand the base channel/user URL back to the router
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        looks_like_video = (
            page_type.startswith('video')
            and video_id is not None
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3264
3265
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Pure configuration class: all extraction logic lives in the base class.
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3298
3299
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra 'params' value sent with the search request (set by subclasses,
    # e.g. YoutubeSearchDateIE uses it to sort by upload date)
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent results for query.

        Pages through the youtubei/v1/search API, carrying the continuation
        token between requests; stops early when a page fails to download or
        yields no parseable contents.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page and continuation pages nest the section list
            # differently; try both shapes.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Skip non-video renderers (channels, playlists, shelves, ...)
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # Continuation token for the next page lives in the second
            # section-list entry; without it there are no further pages.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3388
3389
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant that orders results newest-first (``ytsearchdate``)."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded filter ('CAI=') passed as the innertube 'params' value.
    # NOTE(review): presumably a base64 protobuf selecting upload-date sort — confirm
    _SEARCH_PARAMS = 'CAI%3D'
3395
3396
class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    """Extracts video results from a youtube.com/results search URL."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _process_json_dict(self, obj, videos, c):
        """Collect video renderers into *videos*; stash continuation data in *c*."""
        if "videoId" in obj:
            videos.append(obj)
            return
        if "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]

    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
        """Append (id, title) data found in *page* to the two in/out lists."""
        initial_data = self._parse_json(
            self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)

        for renderer in self._find_videos_in_json(initial_data):
            video_id = try_get(renderer, lambda x: x['videoId'])
            video_title = (
                try_get(renderer, lambda x: x['title']['runs'][0]['text'])
                or try_get(renderer, lambda x: x['title']['simpleText']))

            # Not a videoRenderer, or the title layout changed under us.
            if video_id is None or video_title is None:
                continue

            video_title = video_title.strip()

            if video_id in ids_in_page:
                # Already recorded; backfill a missing title if we got one now.
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs found in *page*."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        return self.playlist_result(
            self._entries(webpage, query, n=5), playlist_title=query)
3456
3457
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extracts a show page by delegating to its per-season playlists page."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just a container of playlists; reuse the playlists extractor.
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3475
3476
class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        # e.g. 'youtube:recommended'
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authentication is mandatory.
        self._login()

    def _process_entries(self, entries, seen):
        """Yield url results for entries whose videoId is not yet in *seen*.

        *seen* is a list of previously accepted entry dicts; it is extended
        in place with the newly accepted entries, so successive calls (one
        per feed page) act as a de-duplicating filter. Yields nothing when
        every entry was already seen (the caller's stop condition).
        """
        # Build the set of known ids once instead of rescanning *seen* for
        # every entry (the previous nested loop was O(len(entries)*len(seen))).
        # *seen* does not change while filtering, so a one-shot set is safe.
        seen_ids = set(old['videoId'] for old in seen)

        new_info = []
        for v in entries:
            v_id = try_get(v, lambda x: x['videoId'])
            if not v_id:
                # entry without an id cannot be extracted; skip it
                continue
            if v_id not in seen_ids:
                new_info.append(v)

        if not new_info:
            return

        seen.extend(new_info)
        for video in new_info:
            yield self.url_result(
                try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(),
                video_title=self._extract_title(video))

    def _real_extract(self, url):
        # The URL itself is ignored; the feed name determines the page.
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
                                    playlist_title=self._PLAYLIST_TITLE)
3520
3521
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extracts the authenticated user's Watch Later list (playlist 'WL')."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A watch URL with list=WL may really mean "just this video".
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        # Otherwise extract the whole Watch Later playlist.
        return self._extract_playlist('WL')[1]
3541
3542
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolves the user's favourites page to its backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a regular playlist id; scrape and delegate.
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
3553
3554
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed of videos YouTube recommends to the logged-in account."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3560
3561
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed of new videos from the account's subscribed channels."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3567
3568
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed of the account's watch history."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3574
3575
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs that lost their v= parameter (typically because an
    unquoted '&' split the command line) and raises a helpful error."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
        attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # No video id survived in the URL, so extraction is impossible;
        # fail with advice on the most likely cause (shell quoting).
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3623
3624
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches a watch URL whose video id is shorter than the full 11 chars."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The id is too short to be valid, so the URL must have been cut off.
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)