# Source: youtube_dlc/extractor/youtube.py (yt-dlp, formerly youtube-dlc)
# Captured from a git-blame web view (jfr.im mirror) at merge commit
# "Merge pull request #187 from pukkandan/break-on-existing".
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28)
29from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 js_to_json,
40 mimetype2ext,
41 orderedSet,
42 parse_codecs,
43 parse_count,
44 parse_duration,
45 remove_quotes,
46 remove_start,
47 smuggle_url,
48 str_or_none,
49 str_to_int,
50 try_get,
51 unescapeHTML,
52 unified_strdate,
53 unsmuggle_url,
54 uppercase_escape,
55 url_or_none,
56 urlencode_postdata,
57)
58
59
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Entry points of Google's account sign-in pages.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # JSON endpoints of the web sign-in flow ("GlifWebSignIn"):
    # lookup resolves a username to an account hash, challenge submits
    # the password, and _TFA_URL ({0} = "TL" token from the challenge
    # response) submits a two-factor code.
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches ids of all playlist flavours (PL/LL/UU/... prefixes).
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
    # Captures the ytInitialData JSON blob embedded in watch/playlist pages.
    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
    # Captures the arguments of ytcfg.set(...) calls (page configuration).
    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"

    # Headers identifying us as the desktop web client.
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Force the English UI via the PREF cookie so text-based
        # extraction is locale-independent.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap plain video ids into url_result dicts handled by YoutubeIE.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            if self._downloader.params.get('cookiefile') and False:  # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
                self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST a sign-in step: hidden form inputs plus the positional
            # "f.req" payload expected by the endpoint. Responses are JSON
            # preceded by an anti-XSSI prefix, which transform_source strips
            # (everything before the first '[').
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Positional request payload for the account lookup step; the
        # array layout mirrors what the web client sends (mostly opaque).
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        # [0][2] of the lookup response holds the account hash used by all
        # subsequent steps (per observed response layout).
        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        # Password-submission payload, again positional/opaque.
        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        # A populated [0][5] entry signals a login error (e.g. bad password).
        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            warn(
                'Unable to login: %s' % 'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        # A nested challenge entry means additional verification is required
        # (TFA or an interactive challenge we cannot solve).
        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # "TL" token required by the TFA submission URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Strip the optional "G-" prefix from SMS-style codes.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    warn(
                        'Unable to finish TFA: %s' % 'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Human-readable explanations for challenges we cannot
                # solve programmatically; the user must log in via browser.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        # Fetching the CheckCookie URL finalizes the session cookies; the
        # redirect target confirms whether login actually succeeded.
        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Ensure a private copy of the query dict is passed down so the
        # caller's dict is never mutated by the base implementation.
        query = kwargs.get('query', {}).copy()
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _get_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON embedded in a page.

        Returns the parsed dict, or None if the blob is absent or invalid.
        """
        config = self._search_regex(
            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
            webpage, 'ytInitialData', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

    def _real_initialize(self):
        # No downloader means we cannot set cookies or report anything.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
299
300
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Shared machinery for list-like pages (playlists, channel tabs).

    Subclasses define _is_entry() to recognize a renderer dict and
    _process_entries() to turn recognized renderers into url_results.
    """

    def _find_entries_in_json(self, extracted):
        """Recursively scan a parsed JSON tree for entry renderers.

        Returns (entries, continuation) where entries is a list of dicts
        accepted by self._is_entry() and continuation is the first dict
        containing a 'continuationCommand' key (or None).
        """
        entries = []
        c = {}

        def _real_find(obj):
            # Strings and None are leaves; nothing to recurse into.
            # NOTE(review): isinstance(obj, str) — under Python 2 JSON
            # strings are unicode, not str; they still fall through all
            # branches below harmlessly.
            if obj is None or isinstance(obj, str):
                return

            if type(obj) is list:
                for elem in obj:
                    _real_find(elem)

            if type(obj) is dict:
                # A recognized entry is collected whole; do not descend
                # further into it.
                if self._is_entry(obj):
                    entries.append(obj)
                    return

                # First continuation token found wins (stored in the
                # closed-over dict c).
                if 'continuationCommand' in obj:
                    c['continuation'] = obj
                    return

                for _, o in obj.items():
                    _real_find(o)

        _real_find(extracted)

        return entries, try_get(c, lambda x: x["continuation"])

    def _entries(self, page, playlist_id, max_pages=None):
        """Generate entries from *page*, following InnerTube continuations.

        page -- HTML of the first list page
        playlist_id -- id used for logging/error reporting
        max_pages -- optional cap on the number of pages fetched
        """
        # `seen` is threaded through to _process_entries; the subclasses
        # in this file do not currently use it.
        seen = []

        # Collect ytcfg.set(...) config fragments; INNERTUBE_API_KEY and
        # INNERTUBE_CONTEXT from here are needed for continuation requests.
        yt_conf = {}
        for m in re.finditer(self._YTCFG_DATA_RE, page):
            parsed = self._parse_json(m.group(1), playlist_id,
                                      transform_source=js_to_json, fatal=False)
            if parsed:
                yt_conf.update(parsed)

        data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)

        for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1):
            entries, continuation = self._find_entries_in_json(data_json)
            processed = self._process_entries(entries, seen)

            if not processed:
                break
            for entry in processed:
                yield entry

            # Without a continuation token or the API config we cannot
            # fetch further pages.
            if not continuation or not yt_conf:
                break
            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
            if not continuation_token or not continuation_url:
                break

            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    data_json = self._download_json(
                        'https://www.youtube.com%s' % continuation_url,
                        playlist_id,
                        'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),

                        transform_source=uppercase_escape,
                        query={
                            'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
                        },
                        data=str(json.dumps({
                            'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
                            'continuation': continuation_token
                        })).encode(encoding='UTF-8', errors='strict'),
                        headers={
                            'Content-Type': 'application/json'
                        }
                    )
                    break
                except ExtractorError as e:
                    # Only retry on server-side errors; anything else (and
                    # exhausted retries) is re-raised.
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise

    def _extract_title(self, renderer):
        """Return a renderer's title from either of its two known shapes."""
        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
        if title:
            return title
        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
395
396
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """List extractor whose entries are individual videos."""

    def _is_entry(self, obj):
        # A video renderer is any dict carrying a 'videoId' key.
        return 'videoId' in obj

    def _process_entries(self, entries, seen):
        """Yield one url_result per unique video id, keeping first-seen order.

        If the same id appears more than once, the first non-empty title
        encountered for it is used.
        """
        found_ids = []
        found_titles = []
        for renderer in entries:
            vid = try_get(renderer, lambda x: x['videoId'])
            title = self._extract_title(renderer)

            # Skip anything that is not a videoRenderer or whose title
            # extraction failed.
            if vid is None or title is None:
                continue

            title = title.strip()

            if vid in found_ids:
                pos = found_ids.index(vid)
                # Upgrade an empty title with a later non-empty one.
                if title and not found_titles[pos]:
                    found_titles[pos] = title
            else:
                found_ids.append(vid)
                found_titles.append(title)

        for vid, title in zip(found_ids, found_titles):
            yield self.url_result(vid, 'Youtube', vid, title)
424
425
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """List extractor whose entries are playlists rather than videos."""

    def _is_entry(self, obj):
        # A playlist renderer is any dict carrying a 'playlistId' key.
        return 'playlistId' in obj

    def _process_entries(self, entries, seen):
        """Yield a YoutubePlaylist url_result per unique playlist id."""
        unique_ids = orderedSet(
            try_get(r, lambda x: x['playlistId']) for r in entries)
        for playlist_id in unique_ids:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id,
                'YoutubePlaylist')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # Title is best-effort; the playlist is still returned without it.
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
441
442
443class YoutubeIE(YoutubeBaseInfoExtractor):
444 IE_DESC = 'YouTube.com'
445 _VALID_URL = r"""(?x)^
446 (
447 (?:https?://|//) # http(s):// or protocol-independent URL
448 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
449 (?:www\.)?deturl\.com/www\.youtube\.com/|
450 (?:www\.)?pwnyoutube\.com/|
451 (?:www\.)?hooktube\.com/|
452 (?:www\.)?yourepeat\.com/|
453 tube\.majestyc\.net/|
454 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
455 (?:(?:www|dev)\.)?invidio\.us/|
456 (?:(?:www|no)\.)?invidiou\.sh/|
457 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
458 (?:www\.)?invidious\.kabi\.tk/|
459 (?:www\.)?invidious\.13ad\.de/|
460 (?:www\.)?invidious\.mastodon\.host/|
461 (?:www\.)?invidious\.nixnet\.xyz/|
462 (?:www\.)?invidious\.drycat\.fr/|
463 (?:www\.)?tube\.poal\.co/|
464 (?:www\.)?vid\.wxzm\.sx/|
465 (?:www\.)?yewtu\.be/|
466 (?:www\.)?yt\.elukerio\.org/|
467 (?:www\.)?yt\.lelux\.fi/|
468 (?:www\.)?invidious\.ggc-project\.de/|
469 (?:www\.)?yt\.maisputain\.ovh/|
470 (?:www\.)?invidious\.13ad\.de/|
471 (?:www\.)?invidious\.toot\.koeln/|
472 (?:www\.)?invidious\.fdn\.fr/|
473 (?:www\.)?watch\.nettohikari\.com/|
474 (?:www\.)?kgg2m7yk5aybusll\.onion/|
475 (?:www\.)?qklhadlycap4cnod\.onion/|
476 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
477 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
478 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
479 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
480 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
481 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
482 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
483 (?:.*?\#/)? # handle anchor (#/) redirect urls
484 (?: # the various things that can precede the ID:
485 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
486 |(?: # or the v= param in all its forms
487 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
488 (?:\?|\#!?) # the params delimiter ? or # or #!
489 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
490 v=
491 )
492 ))
493 |(?:
494 youtu\.be| # just youtu.be/xxxx
495 vid\.plus| # or vid.plus/xxxx
496 zwearz\.com/watch| # or zwearz.com/watch/xxxx
497 )/
498 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
499 )
500 )? # all until now is optional -> you can pass the naked ID
501 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
502 (?!.*?\blist=
503 (?:
504 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
505 WL # WL are handled by the watch later IE
506 )
507 )
508 (?(1).+)? # if we found the ID, everything can follow
509 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
510 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
511 _PLAYER_INFO_RE = (
512 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
513 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
514 )
515 _formats = {
516 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
517 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
518 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
519 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
520 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
521 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
522 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
523 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
524 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
525 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
526 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
527 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
528 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
529 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
530 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
531 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
532 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
533 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
534
535
536 # 3D videos
537 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
538 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
539 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
540 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
541 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
542 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
543 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
544
545 # Apple HTTP Live Streaming
546 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
547 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
548 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
549 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
550 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
551 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
552 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
553 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
554
555 # DASH mp4 video
556 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
557 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
558 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
559 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
560 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
561 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
562 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
563 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
564 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
565 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
566 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
567 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
568
569 # Dash mp4 audio
570 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
571 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
572 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
573 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
574 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
575 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
576 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
577
578 # Dash webm
579 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
580 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
581 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
582 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
583 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
584 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
585 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
586 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
587 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
588 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
589 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
590 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
591 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
592 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
593 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
594 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
595 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
596 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
597 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
598 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
599 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
600 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
601
602 # Dash webm audio
603 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
604 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
605
606 # Dash webm audio with opus inside
607 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
608 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
609 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
610
611 # RTMP (unnamed)
612 '_rtmp': {'protocol': 'rtmp'},
613
614 # av01 video only formats sometimes served with "unknown" codecs
615 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
616 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
617 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
618 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
619 }
620 _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
621
622 _GEO_BYPASS = False
623
624 IE_NAME = 'youtube'
625 _TESTS = [
626 {
627 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
628 'info_dict': {
629 'id': 'BaW_jenozKc',
630 'ext': 'mp4',
631 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
632 'uploader': 'Philipp Hagemeister',
633 'uploader_id': 'phihag',
634 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
635 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
636 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
637 'upload_date': '20121002',
638 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
639 'categories': ['Science & Technology'],
640 'tags': ['youtube-dl'],
641 'duration': 10,
642 'view_count': int,
643 'like_count': int,
644 'dislike_count': int,
645 'start_time': 1,
646 'end_time': 9,
647 }
648 },
649 {
650 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
651 'note': 'Embed-only video (#1746)',
652 'info_dict': {
653 'id': 'yZIXLfi8CZQ',
654 'ext': 'mp4',
655 'upload_date': '20120608',
656 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
657 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
658 'uploader': 'SET India',
659 'uploader_id': 'setindia',
660 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
661 'age_limit': 18,
662 }
663 },
664 {
665 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
666 'note': 'Use the first video ID in the URL',
667 'info_dict': {
668 'id': 'BaW_jenozKc',
669 'ext': 'mp4',
670 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
671 'uploader': 'Philipp Hagemeister',
672 'uploader_id': 'phihag',
673 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
674 'upload_date': '20121002',
675 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
676 'categories': ['Science & Technology'],
677 'tags': ['youtube-dl'],
678 'duration': 10,
679 'view_count': int,
680 'like_count': int,
681 'dislike_count': int,
682 },
683 'params': {
684 'skip_download': True,
685 },
686 },
687 {
688 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
689 'note': '256k DASH audio (format 141) via DASH manifest',
690 'info_dict': {
691 'id': 'a9LDPn-MO4I',
692 'ext': 'm4a',
693 'upload_date': '20121002',
694 'uploader_id': '8KVIDEO',
695 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
696 'description': '',
697 'uploader': '8KVIDEO',
698 'title': 'UHDTV TEST 8K VIDEO.mp4'
699 },
700 'params': {
701 'youtube_include_dash_manifest': True,
702 'format': '141',
703 },
704 'skip': 'format 141 not served anymore',
705 },
706 # Controversy video
707 {
708 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
709 'info_dict': {
710 'id': 'T4XJQO3qol8',
711 'ext': 'mp4',
712 'duration': 219,
713 'upload_date': '20100909',
714 'uploader': 'Amazing Atheist',
715 'uploader_id': 'TheAmazingAtheist',
716 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
717 'title': 'Burning Everyone\'s Koran',
718 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
719 }
720 },
721 # Normal age-gate video (embed allowed)
722 {
723 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
724 'info_dict': {
725 'id': 'HtVdAasjOgU',
726 'ext': 'mp4',
727 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
728 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
729 'duration': 142,
730 'uploader': 'The Witcher',
731 'uploader_id': 'WitcherGame',
732 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
733 'upload_date': '20140605',
734 'age_limit': 18,
735 },
736 },
737 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
738 {
739 'url': 'lqQg6PlCWgI',
740 'info_dict': {
741 'id': 'lqQg6PlCWgI',
742 'ext': 'mp4',
743 'duration': 6085,
744 'upload_date': '20150827',
745 'uploader_id': 'olympic',
746 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
747 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
748 'uploader': 'Olympic',
749 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
750 },
751 'params': {
752 'skip_download': 'requires avconv',
753 }
754 },
755 # Non-square pixels
756 {
757 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
758 'info_dict': {
759 'id': '_b-2C3KPAM0',
760 'ext': 'mp4',
761 'stretched_ratio': 16 / 9.,
762 'duration': 85,
763 'upload_date': '20110310',
764 'uploader_id': 'AllenMeow',
765 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
766 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
767 'uploader': '孫ᄋᄅ',
768 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
769 },
770 },
771 # url_encoded_fmt_stream_map is empty string
772 {
773 'url': 'qEJwOuvDf7I',
774 'info_dict': {
775 'id': 'qEJwOuvDf7I',
776 'ext': 'webm',
777 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
778 'description': '',
779 'upload_date': '20150404',
780 'uploader_id': 'spbelect',
781 'uploader': 'Наблюдатели Петербурга',
782 },
783 'params': {
784 'skip_download': 'requires avconv',
785 },
786 'skip': 'This live event has ended.',
787 },
788 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
789 {
790 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
791 'info_dict': {
792 'id': 'FIl7x6_3R5Y',
793 'ext': 'webm',
794 'title': 'md5:7b81415841e02ecd4313668cde88737a',
795 'description': 'md5:116377fd2963b81ec4ce64b542173306',
796 'duration': 220,
797 'upload_date': '20150625',
798 'uploader_id': 'dorappi2000',
799 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
800 'uploader': 'dorappi2000',
801 'formats': 'mincount:31',
802 },
803 'skip': 'not actual anymore',
804 },
805 # DASH manifest with segment_list
806 {
807 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
808 'md5': '8ce563a1d667b599d21064e982ab9e31',
809 'info_dict': {
810 'id': 'CsmdDsKjzN8',
811 'ext': 'mp4',
812 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
813 'uploader': 'Airtek',
814 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
815 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
816 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
817 },
818 'params': {
819 'youtube_include_dash_manifest': True,
820 'format': '135', # bestvideo
821 },
822 'skip': 'This live event has ended.',
823 },
824 {
825 # Multifeed videos (multiple cameras), URL is for Main Camera
826 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
827 'info_dict': {
828 'id': 'jqWvoWXjCVs',
829 'title': 'teamPGP: Rocket League Noob Stream',
830 'description': 'md5:dc7872fb300e143831327f1bae3af010',
831 },
832 'playlist': [{
833 'info_dict': {
834 'id': 'jqWvoWXjCVs',
835 'ext': 'mp4',
836 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
837 'description': 'md5:dc7872fb300e143831327f1bae3af010',
838 'duration': 7335,
839 'upload_date': '20150721',
840 'uploader': 'Beer Games Beer',
841 'uploader_id': 'beergamesbeer',
842 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
843 'license': 'Standard YouTube License',
844 },
845 }, {
846 'info_dict': {
847 'id': '6h8e8xoXJzg',
848 'ext': 'mp4',
849 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
850 'description': 'md5:dc7872fb300e143831327f1bae3af010',
851 'duration': 7337,
852 'upload_date': '20150721',
853 'uploader': 'Beer Games Beer',
854 'uploader_id': 'beergamesbeer',
855 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
856 'license': 'Standard YouTube License',
857 },
858 }, {
859 'info_dict': {
860 'id': 'PUOgX5z9xZw',
861 'ext': 'mp4',
862 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
863 'description': 'md5:dc7872fb300e143831327f1bae3af010',
864 'duration': 7337,
865 'upload_date': '20150721',
866 'uploader': 'Beer Games Beer',
867 'uploader_id': 'beergamesbeer',
868 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
869 'license': 'Standard YouTube License',
870 },
871 }, {
872 'info_dict': {
873 'id': 'teuwxikvS5k',
874 'ext': 'mp4',
875 'title': 'teamPGP: Rocket League Noob Stream (zim)',
876 'description': 'md5:dc7872fb300e143831327f1bae3af010',
877 'duration': 7334,
878 'upload_date': '20150721',
879 'uploader': 'Beer Games Beer',
880 'uploader_id': 'beergamesbeer',
881 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
882 'license': 'Standard YouTube License',
883 },
884 }],
885 'params': {
886 'skip_download': True,
887 },
888 'skip': 'This video is not available.',
889 },
890 {
891 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
892 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
893 'info_dict': {
894 'id': 'gVfLd0zydlo',
895 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
896 },
897 'playlist_count': 2,
898 'skip': 'Not multifeed anymore',
899 },
900 {
901 'url': 'https://vid.plus/FlRa-iH7PGw',
902 'only_matching': True,
903 },
904 {
905 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
906 'only_matching': True,
907 },
908 {
909 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
910 # Also tests cut-off URL expansion in video description (see
911 # https://github.com/ytdl-org/youtube-dl/issues/1892,
912 # https://github.com/ytdl-org/youtube-dl/issues/8164)
913 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
914 'info_dict': {
915 'id': 'lsguqyKfVQg',
916 'ext': 'mp4',
917 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
918 'alt_title': 'Dark Walk - Position Music',
919 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
920 'duration': 133,
921 'upload_date': '20151119',
922 'uploader_id': 'IronSoulElf',
923 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
924 'uploader': 'IronSoulElf',
925 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
926 'track': 'Dark Walk - Position Music',
927 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
928 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
929 },
930 'params': {
931 'skip_download': True,
932 },
933 },
934 {
935 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
936 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
937 'only_matching': True,
938 },
939 {
940 # Video with yt:stretch=17:0
941 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
942 'info_dict': {
943 'id': 'Q39EVAstoRM',
944 'ext': 'mp4',
945 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
946 'description': 'md5:ee18a25c350637c8faff806845bddee9',
947 'upload_date': '20151107',
948 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
949 'uploader': 'CH GAMER DROID',
950 },
951 'params': {
952 'skip_download': True,
953 },
954 'skip': 'This video does not exist.',
955 },
956 {
957 # Video licensed under Creative Commons
958 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
959 'info_dict': {
960 'id': 'M4gD1WSo5mA',
961 'ext': 'mp4',
962 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
963 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
964 'duration': 721,
965 'upload_date': '20150127',
966 'uploader_id': 'BerkmanCenter',
967 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
968 'uploader': 'The Berkman Klein Center for Internet & Society',
969 'license': 'Creative Commons Attribution license (reuse allowed)',
970 },
971 'params': {
972 'skip_download': True,
973 },
974 },
975 {
976 # Channel-like uploader_url
977 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
978 'info_dict': {
979 'id': 'eQcmzGIKrzg',
980 'ext': 'mp4',
981 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
982 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
983 'duration': 4060,
984 'upload_date': '20151119',
985 'uploader': 'Bernie Sanders',
986 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
987 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
988 'license': 'Creative Commons Attribution license (reuse allowed)',
989 },
990 'params': {
991 'skip_download': True,
992 },
993 },
994 {
995 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
996 'only_matching': True,
997 },
998 {
999 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1000 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1001 'only_matching': True,
1002 },
1003 {
1004 # Rental video preview
1005 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1006 'info_dict': {
1007 'id': 'uGpuVWrhIzE',
1008 'ext': 'mp4',
1009 'title': 'Piku - Trailer',
1010 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1011 'upload_date': '20150811',
1012 'uploader': 'FlixMatrix',
1013 'uploader_id': 'FlixMatrixKaravan',
1014 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1015 'license': 'Standard YouTube License',
1016 },
1017 'params': {
1018 'skip_download': True,
1019 },
1020 'skip': 'This video is not available.',
1021 },
1022 {
1023 # YouTube Red video with episode data
1024 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1025 'info_dict': {
1026 'id': 'iqKdEhx-dD4',
1027 'ext': 'mp4',
1028 'title': 'Isolation - Mind Field (Ep 1)',
1029 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1030 'duration': 2085,
1031 'upload_date': '20170118',
1032 'uploader': 'Vsauce',
1033 'uploader_id': 'Vsauce',
1034 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1035 'series': 'Mind Field',
1036 'season_number': 1,
1037 'episode_number': 1,
1038 },
1039 'params': {
1040 'skip_download': True,
1041 },
1042 'expected_warnings': [
1043 'Skipping DASH manifest',
1044 ],
1045 },
1046 {
1047 # The following content has been identified by the YouTube community
1048 # as inappropriate or offensive to some audiences.
1049 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1050 'info_dict': {
1051 'id': '6SJNVb0GnPI',
1052 'ext': 'mp4',
1053 'title': 'Race Differences in Intelligence',
1054 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1055 'duration': 965,
1056 'upload_date': '20140124',
1057 'uploader': 'New Century Foundation',
1058 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1059 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1060 },
1061 'params': {
1062 'skip_download': True,
1063 },
1064 },
1065 {
1066 # itag 212
1067 'url': '1t24XAntNCY',
1068 'only_matching': True,
1069 },
1070 {
1071 # geo restricted to JP
1072 'url': 'sJL6WA-aGkQ',
1073 'only_matching': True,
1074 },
1075 {
1076 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1077 'only_matching': True,
1078 },
1079 {
1080 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1081 'only_matching': True,
1082 },
1083 {
1084 # DRM protected
1085 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1086 'only_matching': True,
1087 },
1088 {
1089 # Video with unsupported adaptive stream type formats
1090 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1091 'info_dict': {
1092 'id': 'Z4Vy8R84T1U',
1093 'ext': 'mp4',
1094 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1095 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1096 'duration': 433,
1097 'upload_date': '20130923',
1098 'uploader': 'Amelia Putri Harwita',
1099 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1100 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1101 'formats': 'maxcount:10',
1102 },
1103 'params': {
1104 'skip_download': True,
1105 'youtube_include_dash_manifest': False,
1106 },
1107 'skip': 'not actual anymore',
1108 },
1109 {
1110 # Youtube Music Auto-generated description
1111 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1112 'info_dict': {
1113 'id': 'MgNrAu2pzNs',
1114 'ext': 'mp4',
1115 'title': 'Voyeur Girl',
1116 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1117 'upload_date': '20190312',
1118 'uploader': 'Stephen - Topic',
1119 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1120 'artist': 'Stephen',
1121 'track': 'Voyeur Girl',
1122 'album': 'it\'s too much love to know my dear',
1123 'release_date': '20190313',
1124 'release_year': 2019,
1125 },
1126 'params': {
1127 'skip_download': True,
1128 },
1129 },
1130 {
1131 # Youtube Music Auto-generated description
1132 # Retrieve 'artist' field from 'Artist:' in video description
1133 # when it is present on youtube music video
1134 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1135 'info_dict': {
1136 'id': 'k0jLE7tTwjY',
1137 'ext': 'mp4',
1138 'title': 'Latch Feat. Sam Smith',
1139 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1140 'upload_date': '20150110',
1141 'uploader': 'Various Artists - Topic',
1142 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1143 'artist': 'Disclosure',
1144 'track': 'Latch Feat. Sam Smith',
1145 'album': 'Latch Featuring Sam Smith',
1146 'release_date': '20121008',
1147 'release_year': 2012,
1148 },
1149 'params': {
1150 'skip_download': True,
1151 },
1152 },
1153 {
1154 # Youtube Music Auto-generated description
1155 # handle multiple artists on youtube music video
1156 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1157 'info_dict': {
1158 'id': '74qn0eJSjpA',
1159 'ext': 'mp4',
1160 'title': 'Eastside',
1161 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1162 'upload_date': '20180710',
1163 'uploader': 'Benny Blanco - Topic',
1164 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1165 'artist': 'benny blanco, Halsey, Khalid',
1166 'track': 'Eastside',
1167 'album': 'Eastside',
1168 'release_date': '20180713',
1169 'release_year': 2018,
1170 },
1171 'params': {
1172 'skip_download': True,
1173 },
1174 },
1175 {
1176 # Youtube Music Auto-generated description
1177 # handle youtube music video with release_year and no release_date
1178 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1179 'info_dict': {
1180 'id': '-hcAI0g-f5M',
1181 'ext': 'mp4',
1182 'title': 'Put It On Me',
1183 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1184 'upload_date': '20180426',
1185 'uploader': 'Matt Maeson - Topic',
1186 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1187 'artist': 'Matt Maeson',
1188 'track': 'Put It On Me',
1189 'album': 'The Hearse',
1190 'release_date': None,
1191 'release_year': 2018,
1192 },
1193 'params': {
1194 'skip_download': True,
1195 },
1196 },
1197 {
1198 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1199 'only_matching': True,
1200 },
1201 {
1202 # invalid -> valid video id redirection
1203 'url': 'DJztXj2GPfl',
1204 'info_dict': {
1205 'id': 'DJztXj2GPfk',
1206 'ext': 'mp4',
1207 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1208 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1209 'upload_date': '20090125',
1210 'uploader': 'Prochorowka',
1211 'uploader_id': 'Prochorowka',
1212 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1213 'artist': 'Panjabi MC',
1214 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1215 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1216 },
1217 'params': {
1218 'skip_download': True,
1219 },
1220 },
1221 {
1222 # empty description results in an empty string
1223 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1224 'info_dict': {
1225 'id': 'x41yOUIvK2k',
1226 'ext': 'mp4',
1227 'title': 'IMG 3456',
1228 'description': '',
1229 'upload_date': '20170613',
1230 'uploader_id': 'ElevageOrVert',
1231 'uploader': 'ElevageOrVert',
1232 },
1233 'params': {
1234 'skip_download': True,
1235 },
1236 },
1237 ]
1238
1239 def __init__(self, *args, **kwargs):
1240 super(YoutubeIE, self).__init__(*args, **kwargs)
1241 self._player_cache = {}
1242
1243 def report_video_info_webpage_download(self, video_id):
1244 """Report attempt to download video info webpage."""
1245 self.to_screen('%s: Downloading video info webpage' % video_id)
1246
1247 def report_information_extraction(self, video_id):
1248 """Report attempt to extract video information."""
1249 self.to_screen('%s: Extracting video information' % video_id)
1250
1251 def report_unavailable_format(self, video_id, format):
1252 """Report extracted video URL."""
1253 self.to_screen('%s: Format %s not available' % (video_id, format))
1254
1255 def report_rtmp_download(self):
1256 """Indicate the download will use the RTMP protocol."""
1257 self.to_screen('RTMP download detected')
1258
1259 def _signature_cache_id(self, example_sig):
1260 """ Return a string representation of a signature """
1261 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1262
1263 @classmethod
1264 def _extract_player_info(cls, player_url):
1265 for player_re in cls._PLAYER_INFO_RE:
1266 id_m = re.search(player_re, player_url)
1267 if id_m:
1268 break
1269 else:
1270 raise ExtractorError('Cannot identify player %r' % player_url)
1271 return id_m.group('ext'), id_m.group('id')
1272
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build the signature-decryption function for the given player.

        Returns a callable mapping an encrypted signature string to its
        decrypted form.  The derived character-index spec is persisted in the
        'youtube-sigfuncs' filesystem cache, keyed by player type/id and the
        shape of example_sig, so the player need not be re-downloaded.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices (a pure character
            # reordering), so the function is rebuilt without any download.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a marker string whose characters are
        # chr(0..n-1); the output then reveals, for every output position,
        # which input index it came from.  That index list is the cached spec.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1312
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Runs func on a marker string, records the resulting index permutation
        and compresses consecutive runs into slice expressions for human
        inspection (used by the youtube_print_sig_code option).
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a Python slice expression, omitting defaults (0, end, 1)
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, emitting a slice for each run of
            # indices advancing by a constant step of +1 or -1, and a single
            # 's[i]' item otherwise.
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                ' return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1351
    def _parse_sig_js(self, jscode):
        """Locate the signature-decryption function inside JS player code.

        Tries progressively looser regexes for the function name, then
        extracts it with JSInterpreter.  Returns a callable applying that
        function to a single signature string.
        """
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])
1372
1373 def _parse_sig_swf(self, file_contents):
1374 swfi = SWFInterpreter(file_contents)
1375 TARGET_CLASSNAME = 'SignatureDecipher'
1376 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1377 initial_function = swfi.extract_function(searched_class, 'decipher')
1378 return lambda s: initial_function([s])
1379
1380 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1381 """Turn the encrypted s field into a working signature"""
1382
1383 if player_url is None:
1384 raise ExtractorError('Cannot decrypt signature without player_url')
1385
1386 if player_url.startswith('//'):
1387 player_url = 'https:' + player_url
1388 elif not re.match(r'https?://', player_url):
1389 player_url = compat_urlparse.urljoin(
1390 'https://www.youtube.com', player_url)
1391 try:
1392 player_id = (player_url, self._signature_cache_id(s))
1393 if player_id not in self._player_cache:
1394 func = self._extract_signature_function(
1395 video_id, player_url, s
1396 )
1397 self._player_cache[player_id] = func
1398 func = self._player_cache[player_id]
1399 if self._downloader.params.get('youtube_print_sig_code'):
1400 self._print_sig_code(func, s)
1401 return func(s)
1402 except Exception as e:
1403 tb = traceback.format_exc()
1404 raise ExtractorError(
1405 'Signature extraction failed: ' + tb, cause=e)
1406
    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
        """Fetch the manual subtitle track list for video_id.

        Returns a dict mapping language code -> list of subtitle format dicts;
        a synthetic 'live_chat' entry is added when a live chat replay exists.
        Returns {} (after a warning) when the track list cannot be downloaded
        or no subtitles are available.
        """
        try:
            subs_doc = self._download_xml(
                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}

        sub_lang_list = {}
        for track in subs_doc.findall('track'):
            lang = track.attrib['lang_code']
            # Keep only the first track seen per language
            if lang in sub_lang_list:
                continue
            sub_formats = []
            for ext in self._SUBTITLE_FORMATS:
                params = compat_urllib_parse_urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                })
                sub_formats.append({
                    'url': 'https://www.youtube.com/api/timedtext?' + params,
                    'ext': ext,
                })
            sub_lang_list[lang] = sub_formats
        if has_live_chat_replay:
            # Pseudo-subtitle handled by the youtube_live_chat_replay downloader
            sub_lang_list['live_chat'] = [
                {
                    'video_id': video_id,
                    'ext': 'json',
                    'protocol': 'youtube_live_chat_replay',
                },
            ]
        if not sub_lang_list:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}
        return sub_lang_list
1446
    def _get_ytplayer_config(self, video_id, webpage):
        """Extract the ytplayer.config / ytInitialPlayerResponse JSON from the
        watch page.  Returns the parsed dict, or None when no pattern matches
        or the JSON cannot be parsed."""
        patterns = (
            # User data may contain arbitrary character sequences that may affect
            # JSON extraction with regex, e.g. when '};' is contained the second
            # regex won't capture the whole JSON. Yet working around by trying more
            # concrete regex first keeping in mind proper quoted string handling
            # to be implemented in future that will replace this workaround (see
            # https://github.com/ytdl-org/youtube-dl/issues/7468,
            # https://github.com/ytdl-org/youtube-dl/pull/7599)
            r';ytplayer\.config\s*=\s*({.+?});ytplayer',
            r';ytplayer\.config\s*=\s*({.+?});',
            r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
        )
        config = self._search_regex(
            patterns, webpage, 'ytplayer.config', default=None)
        if config:
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)
1465
1466 def _get_music_metadata_from_yt_initial(self, yt_initial):
1467 music_metadata = []
1468 key_map = {
1469 'Album': 'album',
1470 'Artist': 'artist',
1471 'Song': 'track'
1472 }
1473 contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
1474 if type(contents) is list:
1475 for content in contents:
1476 music_track = {}
1477 if type(content) is not dict:
1478 continue
1479 videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
1480 if type(videoSecondaryInfoRenderer) is not dict:
1481 continue
1482 rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
1483 if type(rows) is not list:
1484 continue
1485 for row in rows:
1486 metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
1487 if type(metadataRowRenderer) is not dict:
1488 continue
1489 key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
1490 value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
1491 try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
1492 if type(key) is not str or type(value) is not str:
1493 continue
1494 if key in key_map:
1495 if key_map[key] in music_track:
1496 # we've started on a new track
1497 music_metadata.append(music_track)
1498 music_track = {}
1499 music_track[key_map[key]] = value
1500 if len(music_track.keys()):
1501 music_metadata.append(music_track)
1502 return music_metadata
1503
    def _get_automatic_captions(self, video_id, webpage):
        """Extract automatic (ASR / translated) caption tracks for video_id.

        We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.  Returns a dict mapping language
        code -> list of caption format dicts, or {} (with a warning) when no
        automatic captions can be found.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            # Oldest scheme: an explicit ttsurl in the player args
            if "args" in player_config and "ttsurl" in player_config["args"]:
                args = player_config['args']
                caption_url = args['ttsurl']
                timestamp = args['timestamp']

                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build per-language format lists by rewriting the query
                # string of one caption base URL
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if "args" in player_config:
                player_response = player_config["args"].get('player_response')
            else:
                # New player system (ytInitialPlayerResponse) as of October 2020
                player_response = player_config

            if player_response:
                if isinstance(player_response, compat_str):
                    player_response = self._parse_json(
                        player_response, video_id, fatal=False)

                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                caption_tracks = renderer['captionTracks']
                for caption_track in caption_tracks:
                    if 'kind' not in caption_track:
                        # not an automatic transcription
                        continue
                    base_url = caption_track['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

                self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
                return {}

            if "args" in player_config:
                args = player_config["args"]

                # Some videos don't provide ttsurl but rather caption_tracks and
                # caption_translation_languages (e.g. 20LmZk1hakA)
                # Does not used anymore as of 22.06.2017
                caption_tracks = args['caption_tracks']
                caption_translation_languages = args['caption_translation_languages']
                caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
                sub_lang_list = []
                for lang in caption_translation_languages.split(','):
                    lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                    sub_lang = lang_qs.get('lc', [None])[0]
                    if sub_lang:
                        sub_lang_list.append(sub_lang)
                return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1623
    def _mark_watched(self, video_id, video_info, player_response):
        """Ping YouTube's videostats playback URL so the video is marked as
        watched on the logged-in account.  Best-effort: silently returns when
        no playback-tracking URL is available, and the final request is
        non-fatal."""
        playback_url = url_or_none(try_get(
            player_response,
            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0]))
        if not playback_url:
            return
        parsed_playback_url = compat_urlparse.urlparse(playback_url)
        qs = compat_urlparse.parse_qs(parsed_playback_url.query)

        # cpn generation algorithm is reverse engineered from base.js.
        # In fact it works even with dummy cpn.
        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))

        qs.update({
            'ver': ['2'],
            'cpn': [cpn],
        })
        playback_url = compat_urlparse.urlunparse(
            parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

        self._download_webpage(
            playback_url, video_id, 'Marking watched',
            'Unable to mark watched', fatal=False)
1649
    @staticmethod
    def _extract_urls(webpage):
        """Return a list of YouTube embed URLs/ids found in an arbitrary webpage.

        Covers iframe/object/embed/SWFObject players, lazyYT embeds and the
        Wordpress "YouTube Video Importer" plugin markup.
        """
        # Embedded YouTube player
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(list(map(
            unescapeHTML,
            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        entries.extend(m[-1] for m in matches)

        return entries
1681
1682 @staticmethod
1683 def _extract_url(webpage):
1684 urls = YoutubeIE._extract_urls(webpage)
1685 return urls[0] if urls else None
1686
1687 @classmethod
1688 def extract_id(cls, url):
1689 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1690 if mobj is None:
1691 raise ExtractorError('Invalid URL: %s' % url)
1692 video_id = mobj.group(2)
1693 return video_id
1694
1695 def _extract_chapters_from_json(self, webpage, video_id, duration):
1696 if not webpage:
1697 return
1698 initial_data = self._parse_json(
1699 self._search_regex(
1700 r'window\["ytInitialData"\] = (.+);\n', webpage,
1701 'player args', default='{}'),
1702 video_id, fatal=False)
1703 if not initial_data or not isinstance(initial_data, dict):
1704 return
1705 chapters_list = try_get(
1706 initial_data,
1707 lambda x: x['playerOverlays']
1708 ['playerOverlayRenderer']
1709 ['decoratedPlayerBarRenderer']
1710 ['decoratedPlayerBarRenderer']
1711 ['playerBar']
1712 ['chapteredPlayerBarRenderer']
1713 ['chapters'],
1714 list)
1715 if not chapters_list:
1716 return
1717
1718 def chapter_time(chapter):
1719 return float_or_none(
1720 try_get(
1721 chapter,
1722 lambda x: x['chapterRenderer']['timeRangeStartMillis'],
1723 int),
1724 scale=1000)
1725 chapters = []
1726 for next_num, chapter in enumerate(chapters_list, start=1):
1727 start_time = chapter_time(chapter)
1728 if start_time is None:
1729 continue
1730 end_time = (chapter_time(chapters_list[next_num])
1731 if next_num < len(chapters_list) else duration)
1732 if end_time is None:
1733 continue
1734 title = try_get(
1735 chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
1736 compat_str)
1737 chapters.append({
1738 'start_time': start_time,
1739 'end_time': end_time,
1740 'title': title,
1741 })
1742 return chapters
1743
1744 @staticmethod
1745 def _extract_chapters_from_description(description, duration):
1746 if not description:
1747 return None
1748 chapter_lines = re.findall(
1749 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1750 description)
1751 if not chapter_lines:
1752 return None
1753 chapters = []
1754 for next_num, (chapter_line, time_point) in enumerate(
1755 chapter_lines, start=1):
1756 start_time = parse_duration(time_point)
1757 if start_time is None:
1758 continue
1759 if start_time > duration:
1760 break
1761 end_time = (duration if next_num == len(chapter_lines)
1762 else parse_duration(chapter_lines[next_num][1]))
1763 if end_time is None:
1764 continue
1765 if end_time > duration:
1766 end_time = duration
1767 if start_time > end_time:
1768 break
1769 chapter_title = re.sub(
1770 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1771 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1772 chapters.append({
1773 'start_time': start_time,
1774 'end_time': end_time,
1775 'title': chapter_title,
1776 })
1777 return chapters
1778
1779 def _extract_chapters(self, webpage, description, video_id, duration):
1780 return (self._extract_chapters_from_json(webpage, video_id, duration)
1781 or self._extract_chapters_from_description(description, duration))
1782
1783 def _real_extract(self, url):
1784 url, smuggled_data = unsmuggle_url(url, {})
1785
1786 proto = (
1787 'http' if self._downloader.params.get('prefer_insecure', False)
1788 else 'https')
1789
1790 start_time = None
1791 end_time = None
1792 parsed_url = compat_urllib_parse_urlparse(url)
1793 for component in [parsed_url.fragment, parsed_url.query]:
1794 query = compat_parse_qs(component)
1795 if start_time is None and 't' in query:
1796 start_time = parse_duration(query['t'][0])
1797 if start_time is None and 'start' in query:
1798 start_time = parse_duration(query['start'][0])
1799 if end_time is None and 'end' in query:
1800 end_time = parse_duration(query['end'][0])
1801
1802 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1803 mobj = re.search(self._NEXT_URL_RE, url)
1804 if mobj:
1805 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1806 video_id = self.extract_id(url)
1807
1808 # Get video webpage
1809 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1810 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1811
1812 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1813 video_id = qs.get('v', [None])[0] or video_id
1814
1815 # Attempt to extract SWF player URL
1816 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1817 if mobj is not None:
1818 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1819 else:
1820 player_url = None
1821
1822 dash_mpds = []
1823
1824 def add_dash_mpd(video_info):
1825 dash_mpd = video_info.get('dashmpd')
1826 if dash_mpd and dash_mpd[0] not in dash_mpds:
1827 dash_mpds.append(dash_mpd[0])
1828
1829 def add_dash_mpd_pr(pl_response):
1830 dash_mpd = url_or_none(try_get(
1831 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1832 compat_str))
1833 if dash_mpd and dash_mpd not in dash_mpds:
1834 dash_mpds.append(dash_mpd)
1835
1836 is_live = None
1837 view_count = None
1838
1839 def extract_view_count(v_info):
1840 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1841
1842 def extract_player_response(player_response, video_id):
1843 pl_response = str_or_none(player_response)
1844 if not pl_response:
1845 return
1846 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1847 if isinstance(pl_response, dict):
1848 add_dash_mpd_pr(pl_response)
1849 return pl_response
1850
1851 def extract_embedded_config(embed_webpage, video_id):
1852 embedded_config = self._search_regex(
1853 r'setConfig\(({.*})\);',
1854 embed_webpage, 'ytInitialData', default=None)
1855 if embedded_config:
1856 return embedded_config
1857
1858 player_response = {}
1859
1860 # Get video info
1861 video_info = {}
1862 embed_webpage = None
1863 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1864 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1865 cookie_keys = self._get_cookies('https://www.youtube.com').keys()
1866 age_gate = True
1867 # We simulate the access to the video from www.youtube.com/v/{video_id}
1868 # this can be viewed without login into Youtube
1869 url = proto + '://www.youtube.com/embed/%s' % video_id
1870 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1871 ext = extract_embedded_config(embed_webpage, video_id)
1872 # playabilityStatus = re.search(r'{\\\"status\\\":\\\"(?P<playabilityStatus>[^\"]+)\\\"', ext)
1873 playable_in_embed = re.search(r'{\\\"playableInEmbed\\\":(?P<playableinEmbed>[^\,]+)', ext)
1874 if not playable_in_embed:
1875 self.to_screen('Could not determine whether playabale in embed for video %s' % video_id)
1876 playable_in_embed = ''
1877 else:
1878 playable_in_embed = playable_in_embed.group('playableinEmbed')
1879 # check if video is only playable on youtube in other words not playable in embed - if so it requires auth (cookies)
1880 # if re.search(r'player-unavailable">', embed_webpage) is not None:
1881 if playable_in_embed == 'false':
1882 '''
1883 # TODO apply this patch when Support for Python 2.6(!) and above drops
1884 if ({'VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID'} <= cookie_keys
1885 or {'VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO'} <= cookie_keys):
1886 '''
1887 if (set(('VISITOR_INFO1_LIVE', 'HSID', 'SSID', 'SID')) <= set(cookie_keys)
1888 or set(('VISITOR_INFO1_LIVE', '__Secure-3PSID', 'LOGIN_INFO')) <= set(cookie_keys)):
1889 age_gate = False
1890 # Try looking directly into the video webpage
1891 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1892 if ytplayer_config:
1893 args = ytplayer_config.get("args")
1894 if args is not None:
1895 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1896 # Convert to the same format returned by compat_parse_qs
1897 video_info = dict((k, [v]) for k, v in args.items())
1898 add_dash_mpd(video_info)
1899 # Rental video is not rented but preview is available (e.g.
1900 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1901 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1902 if not video_info and args.get('ypc_vid'):
1903 return self.url_result(
1904 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1905 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1906 is_live = True
1907 if not player_response:
1908 player_response = extract_player_response(args.get('player_response'), video_id)
1909 elif not player_response:
1910 player_response = ytplayer_config
1911 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1912 add_dash_mpd_pr(player_response)
1913 else:
1914 raise ExtractorError('Video is age restricted and only playable on Youtube. Requires cookies!', expected=True)
1915 else:
1916 data = compat_urllib_parse_urlencode({
1917 'video_id': video_id,
1918 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1919 'sts': self._search_regex(
1920 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1921 })
1922 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1923 try:
1924 video_info_webpage = self._download_webpage(
1925 video_info_url, video_id,
1926 note='Refetching age-gated info webpage',
1927 errnote='unable to download video info webpage')
1928 except ExtractorError:
1929 video_info_webpage = None
1930 if video_info_webpage:
1931 video_info = compat_parse_qs(video_info_webpage)
1932 pl_response = video_info.get('player_response', [None])[0]
1933 player_response = extract_player_response(pl_response, video_id)
1934 add_dash_mpd(video_info)
1935 view_count = extract_view_count(video_info)
1936 else:
1937 age_gate = False
1938 # Try looking directly into the video webpage
1939 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1940 args = ytplayer_config.get("args")
1941 if args is not None:
1942 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1943 # Convert to the same format returned by compat_parse_qs
1944 video_info = dict((k, [v]) for k, v in args.items())
1945 add_dash_mpd(video_info)
1946 # Rental video is not rented but preview is available (e.g.
1947 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1948 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1949 if not video_info and args.get('ypc_vid'):
1950 return self.url_result(
1951 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1952 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1953 is_live = True
1954 if not player_response:
1955 player_response = extract_player_response(args.get('player_response'), video_id)
1956 elif not player_response:
1957 player_response = ytplayer_config
1958 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1959 add_dash_mpd_pr(player_response)
1960
1961 def extract_unavailable_message():
1962 messages = []
1963 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1964 msg = self._html_search_regex(
1965 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1966 video_webpage, 'unavailable %s' % kind, default=None)
1967 if msg:
1968 messages.append(msg)
1969 if messages:
1970 return '\n'.join(messages)
1971
1972 if not video_info and not player_response:
1973 unavailable_message = extract_unavailable_message()
1974 if not unavailable_message:
1975 unavailable_message = 'Unable to extract video data'
1976 raise ExtractorError(
1977 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1978
1979 if not isinstance(video_info, dict):
1980 video_info = {}
1981
1982 video_details = try_get(
1983 player_response, lambda x: x['videoDetails'], dict) or {}
1984
1985 microformat = try_get(
1986 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1987
1988 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1989 if not video_title:
1990 self._downloader.report_warning('Unable to extract video title')
1991 video_title = '_'
1992
1993 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1994 if video_description:
1995
1996 def replace_url(m):
1997 redir_url = compat_urlparse.urljoin(url, m.group(1))
1998 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1999 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
2000 qs = compat_parse_qs(parsed_redir_url.query)
2001 q = qs.get('q')
2002 if q and q[0]:
2003 return q[0]
2004 return redir_url
2005
2006 description_original = video_description = re.sub(r'''(?x)
2007 <a\s+
2008 (?:[a-zA-Z-]+="[^"]*"\s+)*?
2009 (?:title|href)="([^"]+)"\s+
2010 (?:[a-zA-Z-]+="[^"]*"\s+)*?
2011 class="[^"]*"[^>]*>
2012 [^<]+\.{3}\s*
2013 </a>
2014 ''', replace_url, video_description)
2015 video_description = clean_html(video_description)
2016 else:
2017 video_description = video_details.get('shortDescription')
2018 if video_description is None:
2019 video_description = self._html_search_meta('description', video_webpage)
2020
2021 if not smuggled_data.get('force_singlefeed', False):
2022 if not self._downloader.params.get('noplaylist'):
2023 multifeed_metadata_list = try_get(
2024 player_response,
2025 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
2026 compat_str) or try_get(
2027 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
2028 if multifeed_metadata_list:
2029 entries = []
2030 feed_ids = []
2031 for feed in multifeed_metadata_list.split(','):
2032 # Unquote should take place before split on comma (,) since textual
2033 # fields may contain comma as well (see
2034 # https://github.com/ytdl-org/youtube-dl/issues/8536)
2035 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
2036
2037 def feed_entry(name):
2038 return try_get(feed_data, lambda x: x[name][0], compat_str)
2039
2040 feed_id = feed_entry('id')
2041 if not feed_id:
2042 continue
2043 feed_title = feed_entry('title')
2044 title = video_title
2045 if feed_title:
2046 title += ' (%s)' % feed_title
2047 entries.append({
2048 '_type': 'url_transparent',
2049 'ie_key': 'Youtube',
2050 'url': smuggle_url(
2051 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
2052 {'force_singlefeed': True}),
2053 'title': title,
2054 })
2055 feed_ids.append(feed_id)
2056 self.to_screen(
2057 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
2058 % (', '.join(feed_ids), video_id))
2059 return self.playlist_result(entries, video_id, video_title, video_description)
2060 else:
2061 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
2062
2063 if view_count is None:
2064 view_count = extract_view_count(video_info)
2065 if view_count is None and video_details:
2066 view_count = int_or_none(video_details.get('viewCount'))
2067 if view_count is None and microformat:
2068 view_count = int_or_none(microformat.get('viewCount'))
2069
2070 if is_live is None:
2071 is_live = bool_or_none(video_details.get('isLive'))
2072
2073 has_live_chat_replay = False
2074 if not is_live:
2075 yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
2076 try:
2077 yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
2078 has_live_chat_replay = True
2079 except (KeyError, IndexError, TypeError):
2080 pass
2081
2082 # Check for "rental" videos
2083 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2084 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2085
2086 def _extract_filesize(media_url):
2087 return int_or_none(self._search_regex(
2088 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2089
2090 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2091 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2092
2093 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2094 self.report_rtmp_download()
2095 formats = [{
2096 'format_id': '_rtmp',
2097 'protocol': 'rtmp',
2098 'url': video_info['conn'][0],
2099 'player_url': player_url,
2100 }]
2101 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2102 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2103 if 'rtmpe%3Dyes' in encoded_url_map:
2104 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2105 formats = []
2106 formats_spec = {}
2107 fmt_list = video_info.get('fmt_list', [''])[0]
2108 if fmt_list:
2109 for fmt in fmt_list.split(','):
2110 spec = fmt.split('/')
2111 if len(spec) > 1:
2112 width_height = spec[1].split('x')
2113 if len(width_height) == 2:
2114 formats_spec[spec[0]] = {
2115 'resolution': spec[1],
2116 'width': int_or_none(width_height[0]),
2117 'height': int_or_none(width_height[1]),
2118 }
2119 for fmt in streaming_formats:
2120 itag = str_or_none(fmt.get('itag'))
2121 if not itag:
2122 continue
2123 quality = fmt.get('quality')
2124 quality_label = fmt.get('qualityLabel') or quality
2125 formats_spec[itag] = {
2126 'asr': int_or_none(fmt.get('audioSampleRate')),
2127 'filesize': int_or_none(fmt.get('contentLength')),
2128 'format_note': quality_label,
2129 'fps': int_or_none(fmt.get('fps')),
2130 'height': int_or_none(fmt.get('height')),
2131 # bitrate for itag 43 is always 2147483647
2132 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2133 'width': int_or_none(fmt.get('width')),
2134 }
2135
2136 for fmt in streaming_formats:
2137 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2138 continue
2139 url = url_or_none(fmt.get('url'))
2140
2141 if not url:
2142 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2143 if not cipher:
2144 continue
2145 url_data = compat_parse_qs(cipher)
2146 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2147 if not url:
2148 continue
2149 else:
2150 cipher = None
2151 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2152
2153 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2154 # Unsupported FORMAT_STREAM_TYPE_OTF
2155 if stream_type == 3:
2156 continue
2157
2158 format_id = fmt.get('itag') or url_data['itag'][0]
2159 if not format_id:
2160 continue
2161 format_id = compat_str(format_id)
2162
2163 if cipher:
2164 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2165 ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
2166 jsplayer_url_json = self._search_regex(
2167 ASSETS_RE,
2168 embed_webpage if age_gate else video_webpage,
2169 'JS player URL (1)', default=None)
2170 if not jsplayer_url_json and not age_gate:
2171 # We need the embed website after all
2172 if embed_webpage is None:
2173 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2174 embed_webpage = self._download_webpage(
2175 embed_url, video_id, 'Downloading embed webpage')
2176 jsplayer_url_json = self._search_regex(
2177 ASSETS_RE, embed_webpage, 'JS player URL')
2178
2179 player_url = json.loads(jsplayer_url_json)
2180 if player_url is None:
2181 player_url_json = self._search_regex(
2182 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2183 video_webpage, 'age gate player URL')
2184 player_url = json.loads(player_url_json)
2185
2186 if 'sig' in url_data:
2187 url += '&signature=' + url_data['sig'][0]
2188 elif 's' in url_data:
2189 encrypted_sig = url_data['s'][0]
2190
2191 if self._downloader.params.get('verbose'):
2192 if player_url is None:
2193 player_desc = 'unknown'
2194 else:
2195 player_type, player_version = self._extract_player_info(player_url)
2196 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2197 parts_sizes = self._signature_cache_id(encrypted_sig)
2198 self.to_screen('{%s} signature length %s, %s' %
2199 (format_id, parts_sizes, player_desc))
2200
2201 signature = self._decrypt_signature(
2202 encrypted_sig, video_id, player_url, age_gate)
2203 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2204 url += '&%s=%s' % (sp, signature)
2205 if 'ratebypass' not in url:
2206 url += '&ratebypass=yes'
2207
2208 dct = {
2209 'format_id': format_id,
2210 'url': url,
2211 'player_url': player_url,
2212 }
2213 if format_id in self._formats:
2214 dct.update(self._formats[format_id])
2215 if format_id in formats_spec:
2216 dct.update(formats_spec[format_id])
2217
2218 # Some itags are not included in DASH manifest thus corresponding formats will
2219 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2220 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2221 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2222 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2223
2224 if width is None:
2225 width = int_or_none(fmt.get('width'))
2226 if height is None:
2227 height = int_or_none(fmt.get('height'))
2228
2229 filesize = int_or_none(url_data.get(
2230 'clen', [None])[0]) or _extract_filesize(url)
2231
2232 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2233 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2234
2235 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2236 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2237 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2238
2239 more_fields = {
2240 'filesize': filesize,
2241 'tbr': tbr,
2242 'width': width,
2243 'height': height,
2244 'fps': fps,
2245 'format_note': quality_label or quality,
2246 }
2247 for key, value in more_fields.items():
2248 if value:
2249 dct[key] = value
2250 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2251 if type_:
2252 type_split = type_.split(';')
2253 kind_ext = type_split[0].split('/')
2254 if len(kind_ext) == 2:
2255 kind, _ = kind_ext
2256 dct['ext'] = mimetype2ext(type_split[0])
2257 if kind in ('audio', 'video'):
2258 codecs = None
2259 for mobj in re.finditer(
2260 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2261 if mobj.group('key') == 'codecs':
2262 codecs = mobj.group('val')
2263 break
2264 if codecs:
2265 dct.update(parse_codecs(codecs))
2266 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2267 dct['downloader_options'] = {
2268 # Youtube throttles chunks >~10M
2269 'http_chunk_size': 10485760,
2270 }
2271 formats.append(dct)
2272 else:
2273 manifest_url = (
2274 url_or_none(try_get(
2275 player_response,
2276 lambda x: x['streamingData']['hlsManifestUrl'],
2277 compat_str))
2278 or url_or_none(try_get(
2279 video_info, lambda x: x['hlsvp'][0], compat_str)))
2280 if manifest_url:
2281 formats = []
2282 m3u8_formats = self._extract_m3u8_formats(
2283 manifest_url, video_id, 'mp4', fatal=False)
2284 for a_format in m3u8_formats:
2285 itag = self._search_regex(
2286 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2287 if itag:
2288 a_format['format_id'] = itag
2289 if itag in self._formats:
2290 dct = self._formats[itag].copy()
2291 dct.update(a_format)
2292 a_format = dct
2293 a_format['player_url'] = player_url
2294 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2295 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2296 if self._downloader.params.get('youtube_include_hls_manifest', True):
2297 formats.append(a_format)
2298 else:
2299 error_message = extract_unavailable_message()
2300 if not error_message:
2301 error_message = clean_html(try_get(
2302 player_response, lambda x: x['playabilityStatus']['reason'],
2303 compat_str))
2304 if not error_message:
2305 error_message = clean_html(
2306 try_get(video_info, lambda x: x['reason'][0], compat_str))
2307 if error_message:
2308 raise ExtractorError(error_message, expected=True)
2309 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2310
2311 # uploader
2312 video_uploader = try_get(
2313 video_info, lambda x: x['author'][0],
2314 compat_str) or str_or_none(video_details.get('author'))
2315 if video_uploader:
2316 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2317 else:
2318 self._downloader.report_warning('unable to extract uploader name')
2319
2320 # uploader_id
2321 video_uploader_id = None
2322 video_uploader_url = None
2323 mobj = re.search(
2324 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2325 video_webpage)
2326 if mobj is not None:
2327 video_uploader_id = mobj.group('uploader_id')
2328 video_uploader_url = mobj.group('uploader_url')
2329 else:
2330 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2331 if owner_profile_url:
2332 video_uploader_id = self._search_regex(
2333 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2334 default=None)
2335 video_uploader_url = owner_profile_url
2336
2337 channel_id = (
2338 str_or_none(video_details.get('channelId'))
2339 or self._html_search_meta(
2340 'channelId', video_webpage, 'channel id', default=None)
2341 or self._search_regex(
2342 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2343 video_webpage, 'channel id', default=None, group='id'))
2344 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2345
2346 thumbnails = []
2347 thumbnails_list = try_get(
2348 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2349 for t in thumbnails_list:
2350 if not isinstance(t, dict):
2351 continue
2352 thumbnail_url = url_or_none(t.get('url'))
2353 if not thumbnail_url:
2354 continue
2355 thumbnails.append({
2356 'url': thumbnail_url,
2357 'width': int_or_none(t.get('width')),
2358 'height': int_or_none(t.get('height')),
2359 })
2360
2361 if not thumbnails:
2362 video_thumbnail = None
2363 # We try first to get a high quality image:
2364 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2365 video_webpage, re.DOTALL)
2366 if m_thumb is not None:
2367 video_thumbnail = m_thumb.group(1)
2368 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2369 if thumbnail_url:
2370 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2371 if video_thumbnail:
2372 thumbnails.append({'url': video_thumbnail})
2373
2374 # upload date
2375 upload_date = self._html_search_meta(
2376 'datePublished', video_webpage, 'upload date', default=None)
2377 if not upload_date:
2378 upload_date = self._search_regex(
2379 [r'(?s)id="eow-date.*?>(.*?)</span>',
2380 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2381 video_webpage, 'upload date', default=None)
2382 if not upload_date:
2383 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2384 upload_date = unified_strdate(upload_date)
2385
2386 video_license = self._html_search_regex(
2387 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2388 video_webpage, 'license', default=None)
2389
2390 m_music = re.search(
2391 r'''(?x)
2392 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2393 <ul[^>]*>\s*
2394 <li>(?P<title>.+?)
2395 by (?P<creator>.+?)
2396 (?:
2397 \(.+?\)|
2398 <a[^>]*
2399 (?:
2400 \bhref=["\']/red[^>]*>| # drop possible
2401 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2402 )
2403 .*?
2404 )?</li
2405 ''',
2406 video_webpage)
2407 if m_music:
2408 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2409 video_creator = clean_html(m_music.group('creator'))
2410 else:
2411 video_alt_title = video_creator = None
2412
2413 def extract_meta(field):
2414 return self._html_search_regex(
2415 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2416 video_webpage, field, default=None)
2417
2418 track = extract_meta('Song')
2419 artist = extract_meta('Artist')
2420 album = extract_meta('Album')
2421
2422 # Youtube Music Auto-generated description
2423 release_date = release_year = None
2424 if video_description:
2425 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2426 if mobj:
2427 if not track:
2428 track = mobj.group('track').strip()
2429 if not artist:
2430 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2431 if not album:
2432 album = mobj.group('album'.strip())
2433 release_year = mobj.group('release_year')
2434 release_date = mobj.group('release_date')
2435 if release_date:
2436 release_date = release_date.replace('-', '')
2437 if not release_year:
2438 release_year = int(release_date[:4])
2439 if release_year:
2440 release_year = int(release_year)
2441
2442 yt_initial = self._get_yt_initial_data(video_id, video_webpage)
2443 if yt_initial:
2444 music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
2445 if len(music_metadata):
2446 album = music_metadata[0].get('album')
2447 artist = music_metadata[0].get('artist')
2448 track = music_metadata[0].get('track')
2449
2450 m_episode = re.search(
2451 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2452 video_webpage)
2453 if m_episode:
2454 series = unescapeHTML(m_episode.group('series'))
2455 season_number = int(m_episode.group('season'))
2456 episode_number = int(m_episode.group('episode'))
2457 else:
2458 series = season_number = episode_number = None
2459
2460 m_cat_container = self._search_regex(
2461 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2462 video_webpage, 'categories', default=None)
2463 category = None
2464 if m_cat_container:
2465 category = self._html_search_regex(
2466 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2467 default=None)
2468 if not category:
2469 category = try_get(
2470 microformat, lambda x: x['category'], compat_str)
2471 video_categories = None if category is None else [category]
2472
2473 video_tags = [
2474 unescapeHTML(m.group('content'))
2475 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2476 if not video_tags:
2477 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2478
2479 def _extract_count(count_name):
2480 return str_to_int(self._search_regex(
2481 r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
2482 % re.escape(count_name),
2483 video_webpage, count_name, default=None))
2484
2485 like_count = _extract_count('like')
2486 dislike_count = _extract_count('dislike')
2487
2488 if view_count is None:
2489 view_count = str_to_int(self._search_regex(
2490 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2491 'view count', default=None))
2492
2493 average_rating = (
2494 float_or_none(video_details.get('averageRating'))
2495 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2496
2497 # subtitles
2498 video_subtitles = self.extract_subtitles(
2499 video_id, video_webpage, has_live_chat_replay)
2500 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2501
2502 video_duration = try_get(
2503 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2504 if not video_duration:
2505 video_duration = int_or_none(video_details.get('lengthSeconds'))
2506 if not video_duration:
2507 video_duration = parse_duration(self._html_search_meta(
2508 'duration', video_webpage, 'video duration'))
2509
2510 # Get Subscriber Count of channel
2511 subscriber_count = parse_count(self._search_regex(
2512 r'"text":"([\d\.]+\w?) subscribers"',
2513 video_webpage,
2514 'subscriber count',
2515 default=None
2516 ))
2517
2518 # annotations
2519 video_annotations = None
2520 if self._downloader.params.get('writeannotations', False):
2521 xsrf_token = self._search_regex(
2522 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2523 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2524 invideo_url = try_get(
2525 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2526 if xsrf_token and invideo_url:
2527 xsrf_field_name = self._search_regex(
2528 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2529 video_webpage, 'xsrf field name',
2530 group='xsrf_field_name', default='session_token')
2531 video_annotations = self._download_webpage(
2532 self._proto_relative_url(invideo_url),
2533 video_id, note='Downloading annotations',
2534 errnote='Unable to download video annotations', fatal=False,
2535 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2536
2537 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2538
2539 # Look for the DASH manifest
2540 if self._downloader.params.get('youtube_include_dash_manifest', True):
2541 dash_mpd_fatal = True
2542 for mpd_url in dash_mpds:
2543 dash_formats = {}
2544 try:
2545 def decrypt_sig(mobj):
2546 s = mobj.group(1)
2547 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2548 return '/signature/%s' % dec_s
2549
2550 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2551
2552 for df in self._extract_mpd_formats(
2553 mpd_url, video_id, fatal=dash_mpd_fatal,
2554 formats_dict=self._formats):
2555 if not df.get('filesize'):
2556 df['filesize'] = _extract_filesize(df['url'])
2557 # Do not overwrite DASH format found in some previous DASH manifest
2558 if df['format_id'] not in dash_formats:
2559 dash_formats[df['format_id']] = df
2560 # Additional DASH manifests may end up in HTTP Error 403 therefore
2561 # allow them to fail without bug report message if we already have
2562 # some DASH manifest succeeded. This is temporary workaround to reduce
2563 # burst of bug reports until we figure out the reason and whether it
2564 # can be fixed at all.
2565 dash_mpd_fatal = False
2566 except (ExtractorError, KeyError) as e:
2567 self.report_warning(
2568 'Skipping DASH manifest: %r' % e, video_id)
2569 if dash_formats:
2570 # Remove the formats we found through non-DASH, they
2571 # contain less info and it can be wrong, because we use
2572 # fixed values (for example the resolution). See
2573 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2574 # example.
2575 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2576 formats.extend(dash_formats.values())
2577
2578 # Check for malformed aspect ratio
2579 stretched_m = re.search(
2580 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2581 video_webpage)
2582 if stretched_m:
2583 w = float(stretched_m.group('w'))
2584 h = float(stretched_m.group('h'))
2585 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2586 # We will only process correct ratios.
2587 if w > 0 and h > 0:
2588 ratio = w / h
2589 for f in formats:
2590 if f.get('vcodec') != 'none':
2591 f['stretched_ratio'] = ratio
2592
2593 if not formats:
2594 if 'reason' in video_info:
2595 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2596 regions_allowed = self._html_search_meta(
2597 'regionsAllowed', video_webpage, default=None)
2598 countries = regions_allowed.split(',') if regions_allowed else None
2599 self.raise_geo_restricted(
2600 msg=video_info['reason'][0], countries=countries)
2601 reason = video_info['reason'][0]
2602 if 'Invalid parameters' in reason:
2603 unavailable_message = extract_unavailable_message()
2604 if unavailable_message:
2605 reason = unavailable_message
2606 raise ExtractorError(
2607 'YouTube said: %s' % reason,
2608 expected=True, video_id=video_id)
2609 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2610 raise ExtractorError('This video is DRM protected.', expected=True)
2611
2612 self._sort_formats(formats)
2613
2614 self.mark_watched(video_id, video_info, player_response)
2615
2616 return {
2617 'id': video_id,
2618 'uploader': video_uploader,
2619 'uploader_id': video_uploader_id,
2620 'uploader_url': video_uploader_url,
2621 'channel_id': channel_id,
2622 'channel_url': channel_url,
2623 'upload_date': upload_date,
2624 'license': video_license,
2625 'creator': video_creator or artist,
2626 'title': video_title,
2627 'alt_title': video_alt_title or track,
2628 'thumbnails': thumbnails,
2629 'description': video_description,
2630 'categories': video_categories,
2631 'tags': video_tags,
2632 'subtitles': video_subtitles,
2633 'automatic_captions': automatic_captions,
2634 'duration': video_duration,
2635 'age_limit': 18 if age_gate else 0,
2636 'annotations': video_annotations,
2637 'chapters': chapters,
2638 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2639 'view_count': view_count,
2640 'like_count': like_count,
2641 'dislike_count': dislike_count,
2642 'average_rating': average_rating,
2643 'formats': formats,
2644 'is_live': is_live,
2645 'start_time': start_time,
2646 'end_time': end_time,
2647 'series': series,
2648 'season_number': season_number,
2649 'episode_number': episode_number,
2650 'track': track,
2651 'artist': artist,
2652 'album': album,
2653 'release_date': release_date,
2654 'release_year': release_year,
2655 'subscriber_count': subscriber_count,
2656 }
2657
2658
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes and YouTube Music albums."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    # YouTube Music album playlists share this id prefix and are extracted
    # like regular playlists (see _real_extract), but get fixed uploader info.
    _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
    _YTM_CHANNEL_INFO = {
        'uploader': 'Youtube Music',
        'uploader_id': 'music',  # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
        'uploader_url': 'https://www.youtube.com/music'
    }
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        self._login()

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs scraped from *page*.

        Tries the modern data-video-id markup first, then falls back to
        progressively more relaxed href-based regexes.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix_ids_from_yt_initial(self, yt_initial):
        """Extract video ids from the embedded ytInitialData of a mix page."""
        ids = []
        playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list)
        if playlist_contents:
            for item in playlist_contents:
                videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
                if videoId:
                    ids.append(videoId)
        return ids

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id. Pages are fetched repeatedly, each seeded
        with the last id seen, until no new video ids appear.
        """
        ids = []
        yt_initial = None
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))

            # if no ids in html of page, try using embedded json
            # (idiomatic emptiness test instead of len(...) == 0)
            if not new_ids:
                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
                if yt_initial:
                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # PEP 8: plain def instead of a lambda bound to a name
        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        if not title:
            title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Extract a regular playlist.

        Returns (has_videos, playlist_result); has_videos is False when the
        playlist page yields no entries (used by _real_extract to fall back
        to plain video extraction).
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })
        if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
            playlist.update(self._YTM_CHANNEL_INFO)

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """Check if *url* is video-specific; honor --no-playlist.

        Returns (video_id, result): result is a url_result when only the
        single video should be downloaded, else None.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
                # Mixes require a custom extraction process,
                # Youtube Music playlists act like normal playlists (with randomized order)
                return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
3051
3052
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube channel video listings (/channel/<id>)."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they match.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Hook overridden by YoutubeUserIE (which also uses the original URL);
        # here only the channel id is needed.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel's videos, preferring redirection to the channel's
        uploads playlist; falls back to page-by-page scraping."""
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # 'UC...' channel id maps to the 'UU...' uploads playlist id
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            # Probe for at least one entry; an empty channel page may instead
            # carry an alert (e.g. geo restriction) worth surfacing.
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3152
3153
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user/custom channel URLs (/user/<id>, /c/<id>, ytuser:)."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match their URLs.
        # Scans module globals for every other Youtube*IE class.
        other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # Preserve the original path kind ('user' or 'c'); default to 'user'
        # for bare youtube.com/<id> and ytuser: forms.
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
3211
3212
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for channel /live URLs, resolving to the current live video."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Hand off to YoutubeIE when the /live page points at a video;
        otherwise fall back to the plain channel/user URL."""
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        base_url = url_match.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            # Page could not be fetched; let the channel extractors handle it.
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        looks_like_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if looks_like_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3263
3264
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the /playlists tab of a user or channel.

    All extraction logic lives in the base class; this class only supplies
    the URL pattern, identifiers, and test cases.
    """
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3297
3298
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
    """Search extractor backed by the Innertube /youtubei/v1/search endpoint."""
    IE_DESC = 'YouTube.com searches'
    # There does not appear to be a real upper bound: searching for 'python',
    # for instance, reports more than 8,000,000 results.
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* url_transparent results for *query*, paginating
        via Innertube continuation tokens."""
        request_body = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            request_body['params'] = self._SEARCH_PARAMS
        emitted = 0
        for page_num in itertools.count(1):
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(request_body).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page nests results differently from continuation pages.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for entry in isr_contents:
                video = entry.get('videoRenderer') if isinstance(entry, dict) else None
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # Strip whitespace/thousands separators before parsing digits.
                normalized_views = re.sub(r'\s', '', view_count_text)
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', normalized_views, 'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                emitted += 1
                if emitted == n:
                    return
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            # Mutate the request body so the next page resumes where this left off.
            request_body['continuation'] = token

    def _get_n_results(self, query, n):
        """Return a playlist result holding up to *n* search hits for *query*."""
        return self.playlist_result(self._entries(query, n), query)
3387
3388
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search extractor, but asks the API to order results newest-first.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_PARAMS = 'CAI%3D'  # "params" blob selecting sort-by-upload-date
3394
3395
class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _process_json_dict(self, obj, videos, c):
        """Collect video objects and the continuation blob while walking JSON."""
        if "videoId" in obj:
            videos.append(obj)
        elif "nextContinuationData" in obj:
            c["continuation"] = obj["nextContinuationData"]

    def _real_extract(self, url):
        """Extract a search-results page as a playlist (first 5 pages)."""
        match = re.match(self._VALID_URL, url)
        query = compat_urllib_parse_unquote_plus(match.group('query'))
        webpage = self._download_webpage(url, query)
        entries = self._entries(webpage, query, max_pages=5)
        return self.playlist_result(entries, playlist_title=query)
3425
3426
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the playlists extractor on the show's /playlists page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3444
3445
class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are account-specific, so log in before extracting.
        self._login()

    def _process_entries(self, entries, seen):
        """Yield url results for entries not already present in *seen*.

        *seen* is a caller-owned list of video dicts that accumulates across
        pages; new entries are appended to it so later calls de-duplicate
        against everything yielded so far.
        """
        # Build the id set once instead of linearly rescanning the ever-growing
        # *seen* list for every entry (previously O(len(entries) * len(seen))).
        # try_get also tolerates malformed entries without a 'videoId' key,
        # which previously raised KeyError.
        seen_ids = set()
        for old in seen:
            old_id = try_get(old, lambda x: x['videoId'])
            if old_id:
                seen_ids.add(old_id)

        new_info = []
        for v in entries:
            v_id = try_get(v, lambda x: x['videoId'])
            if not v_id or v_id in seen_ids:
                continue
            # Adding to the set here also de-duplicates repeats within the
            # current batch, not only against previous pages.
            seen_ids.add(v_id)
            new_info.append(v)

        if not new_info:
            return

        seen.extend(new_info)
        for video in new_info:
            yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
                                    playlist_title=self._PLAYLIST_TITLE)
3489
3490
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when requested, otherwise the WL playlist."""
        _, video = self._check_download_just_video(url, 'WL')
        if video:
            return video
        return self._extract_playlist('WL')[1]
3510
3511
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Resolve the favourites page to its backing playlist id."""
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
3522
3523
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Personalised "recommended" feed; depends on the logged-in session.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3529
3530
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Subscriptions feed of the logged-in account.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3536
3537
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Watch-history feed of the logged-in account.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3543
3544
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catches watch/attribution URLs whose v= parameter was lost, typically
    # because an unquoted '&' truncated the command line in the user's shell.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail with a hint that the URL was truncated by the shell."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)
3592
3593
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose video id is shorter than the full 11 characters.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail: the id in *url* is too short to be a real video id."""
        truncated_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (truncated_id, url),
            expected=True)