X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/b28f8d244a3cb901eff5be438ad8f0508bd8e024..301d07fc4bb37ae3bec607b62d52f3ee6c087df1:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 3e19d18a0..44ec579c0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,14 +2,21 @@ from __future__ import unicode_literals +import calendar +import copy +import datetime +import functools import hashlib import itertools import json +import math import os.path import random import re +import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -24,419 +31,295 @@ ) from ..jsinterp import JSInterpreter from ..utils import ( - bool_or_none, + bug_reports_message, clean_html, + datetime_from_str, dict_get, + error_to_compat_str, ExtractorError, - format_field, float_or_none, + format_field, int_or_none, + is_html, + join_nonempty, mimetype2ext, + network_exceptions, + NO_DEFAULT, + orderedSet, parse_codecs, + parse_count, parse_duration, + parse_iso8601, + parse_qs, qualities, + remove_end, remove_start, smuggle_url, str_or_none, str_to_int, + strftime_or_none, + traverse_obj, try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, - urlencode_postdata, urljoin, + variadic, ) -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _RESERVED_NAMES = ( - r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|' - r'movies|results|shared|hashtag|trending|feed|feeds|' - r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - # if self._downloader.params.get('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them. - # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!') - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - # TODO: reverse actual botguard identifier generation algo - 'bgRequest': '["identifier",""]', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor ' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _initialize_consent(self): - cookies = self._get_cookies('https://www.youtube.com/') - if cookies.get('__Secure-3PSID'): - return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - def _real_initialize(self): - self._initialize_consent() - if self._downloader is None: - return - if not self._login(): - return - _YT_WEB_CLIENT_VERSION = '2.20210301.08.00' - _DEFAULT_API_DATA = { - 'context': { +# any clients starting with _ cannot be explicity requested by the user +INNERTUBE_CLIENTS = { + 'web': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': _YT_WEB_CLIENT_VERSION, + 'clientVersion': '2.20211221.00.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + }, + 'web_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_EMBEDDED_PLAYER', + 'clientVersion': '1.20211215.00.01', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 + }, + 'web_music': { + 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', + 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_REMIX', + 'clientVersion': '1.20211213.00.00', } }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, + }, + 'web_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB_CREATOR', + 'clientVersion': '1.20211220.02.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, + }, + 'android': { + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID', + 'clientVersion': '16.49', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False + }, + 'android_embedded': { + 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_EMBEDDED_PLAYER', + 'clientVersion': '16.49', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, + 'REQUIRE_JS_PLAYER': False + }, + 'android_music': { + 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_MUSIC', + 'clientVersion': '4.57', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, + 'REQUIRE_JS_PLAYER': False + }, + 'android_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_CREATOR', + 'clientVersion': '21.47', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, + 'REQUIRE_JS_PLAYER': False + }, + # iOS clients have HLS live streams. Setting device model to get 60fps formats. + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 + 'ios': { + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_embedded': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_MESSAGES_EXTENSION', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_music': { + 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_MUSIC', + 'clientVersion': '4.57', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, + 'REQUIRE_JS_PLAYER': False + }, + 'ios_creator': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS_CREATOR', + 'clientVersion': '21.47', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, + 'REQUIRE_JS_PLAYER': False + }, + # mweb has 'ultralow' formats + # See: https://github.com/yt-dlp/yt-dlp/pull/557 + 'mweb': { + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MWEB', + 'clientVersion': '2.20211221.01.00', + } + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 } +} - _DEFAULT_BASIC_API_HEADERS = { - 'X-YouTube-Client-Name': '1', - 'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION + +def build_innertube_clients(): + third_party = { + 'embedUrl': 'https://google.com', # Can be any valid URL } + base_clients = ('android', 'web', 'ios', 'mweb') + priority = qualities(base_clients[::-1]) + + for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): + ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') + ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') + ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') + ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) + + if client in base_clients: + INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) + agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' + agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + agegate_ytcfg['priority'] -= 1 + elif client.endswith('_embedded'): + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + ytcfg['priority'] -= 2 + else: + ytcfg['priority'] -= 3 - _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' - _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' - _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|= 2 and sync_ids[1]: - # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel - # and just "user_syncid||" for primary channel. We only want the channel_syncid - return sync_ids[0] - - def _extract_ytcfg(self, video_id, webpage): - return self._parse_json( - self._search_regex( - r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) + _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' - def _extract_video(self, renderer): - video_id = renderer.get('videoId') - title = try_get( - renderer, - (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - description = try_get( - renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], - compat_str) - duration = parse_duration(try_get( - renderer, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get( - renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get( - renderer, - (lambda x: x['ownerText']['runs'][0]['text'], - lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) - return { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } + _NETRC_MACHINE = 'youtube' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' _INVIDIOUS_SITES = ( # invidious-redirect websites r'(?:www\.)?redirect\.invidious\.io', r'(?:(?:www|dev)\.)?invidio\.us', # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md r'(?:www\.)?invidious\.pussthecat\.org', - r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.zee\.li', - r'(?:www\.)?vid\.puffyan\.us', - r'(?:(?:www|au)\.)?ytprivate\.com', - r'(?:www\.)?invidious\.namazso\.eu', r'(?:www\.)?invidious\.ethibox\.fr', - r'(?:www\.)?inv\.skyn3t\.in', - r'(?:www\.)?invidious\.himiko\.cloud', - r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', - r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', - r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', # youtube-dl invidious instances list r'(?:(?:www|no)\.)?invidiou\.sh', r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', r'(?:www\.)?invidious\.kabi\.tk', - r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', r'(?:www\.)?yewtu\.be', r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', r'(?:www\.)?kgg2m7yk5aybusll\.onion', r'(?:www\.)?qklhadlycap4cnod\.onion', r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', @@ -445,81 +328,596 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', ) - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| - (?:www\.)?deturl\.com/www\.youtube\.com| - (?:www\.)?pwnyoutube\.com| - (?:www\.)?hooktube\.com| - (?:www\.)?yourepeat\.com| - tube\.majestyc\.net| - %(invidious)s| - youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - %(invidious)s - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) - (?(1).+)? # if we found the ID, everything can follow - $""" % { - 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, - 'invidious': '|'.join(_INVIDIOUS_SITES), - } - _PLAYER_INFO_RE = ( - r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', - r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', - r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', - ) - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + def _login(self): + """ + Attempt to log in to YouTube. + If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. + """ - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + if (self._LOGIN_REQUIRED + and self.get_param('cookiefile') is None + and self.get_param('cookiesfrombrowser') is None): + self.raise_login_required( + 'Login details are needed to download this content', method='cookies') + username, password = self._get_login_info() + if username: + self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}') + + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + + def _real_initialize(self): + self._initialize_pref() + self._initialize_consent() + self._login() + + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' + _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|= 2 and sync_ids[1]: + # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel + # and just "user_syncid||" for primary channel. We only want the channel_syncid + return sync_ids[0] + + @staticmethod + def _extract_visitor_data(*args): + """ + Extracts visitorData from an API response or ytcfg + Appears to be used to track session state + """ + return get_first( + args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + expected_type=str) + + @property + def is_authenticated(self): + return bool(self._generate_sapisidhash_header()) + + def extract_ytcfg(self, video_id, webpage): + if not webpage: + return {} + return self._parse_json( + self._search_regex( + r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', + default='{}'), video_id, fatal=False) or {} + + def generate_api_headers( + self, *, ytcfg=None, account_syncid=None, session_index=None, + visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): + + origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) + headers = { + 'X-YouTube-Client-Name': compat_str( + self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), + 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), + 'Origin': origin, + 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), + 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) + } + if session_index is None: + session_index = self._extract_session_index(ytcfg) + if account_syncid or session_index is not None: + headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 + + auth = self._generate_sapisidhash_header(origin) + if auth is not None: + headers['Authorization'] = auth + headers['X-Origin'] = origin + return {h: v for h, v in headers.items() if v is not None} + + @staticmethod + def _build_api_continuation_query(continuation, ctp=None): + query = { + 'continuation': continuation + } + # TODO: Inconsistency with clickTrackingParams. + # Currently we have a fixed ctp contained within context (from ytcfg) + # and a ctp in root query for continuation. + if ctp: + query['clickTracking'] = {'clickTrackingParams': ctp} + return query + + @classmethod + def _extract_next_continuation_data(cls, renderer): + next_continuation = try_get( + renderer, (lambda x: x['continuations'][0]['nextContinuationData'], + lambda x: x['continuation']['reloadContinuationData']), dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation_ep_data(cls, continuation_ep: dict): + if isinstance(continuation_ep, dict): + continuation = try_get( + continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + if not continuation: + return + ctp = continuation_ep.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) + + for content in contents: + if not isinstance(content, dict): + continue + continuation_ep = try_get( + content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'], + lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']), + dict) + continuation = cls._extract_continuation_ep_data(continuation_ep) + if continuation: + return continuation + + @classmethod + def _extract_alerts(cls, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + if not isinstance(alert_dict, dict): + continue + for alert in alert_dict.values(): + alert_type = alert.get('type') + if not alert_type: + continue + message = cls._get_text(alert, 'text') + if message: + yield alert_type, message + + def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): + errors = [] + warnings = [] + for alert_type, alert_message in alerts: + if alert_type.lower() == 'error' and fatal: + errors.append([alert_type, alert_message]) + else: + warnings.append([alert_type, alert_message]) + + for alert_type, alert_message in (warnings + errors[:-1]): + self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once) + if errors: + raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected) + + def _extract_and_report_alerts(self, data, *args, **kwargs): + return self._report_alerts(self._extract_alerts(data), *args, **kwargs) + + def _extract_badges(self, renderer: dict): + badges = set() + for badge in try_get(renderer, lambda x: x['badges'], list) or []: + label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str) + if label: + badges.add(label.lower()) + return badges + + @staticmethod + def _get_text(data, *path_list, max_runs=None): + for path in path_list or [None]: + if path is None: + obj = [data] + else: + obj = traverse_obj(data, path, default=[]) + if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)): + obj = [obj] + for item in obj: + text = try_get(item, lambda x: x['simpleText'], compat_str) + if text: + return text + runs = try_get(item, lambda x: x['runs'], list) or [] + if not runs and isinstance(item, list): + runs = item + + runs = runs[:min(len(runs), max_runs or len(runs))] + text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[])) + if text: + return text + + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + """ + mobj = re.search(r'(?Ptoday|yesterday|now)|(?P