X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/0bb1bc1b107b9c3d68ea0c887bd09cad75d7714d..301d07fc4bb37ae3bec607b62d52f3ee6c087df1:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 9ca81e6cb..44ec579c0 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,18 +2,21 @@ from __future__ import unicode_literals -import base64 import calendar import copy import datetime +import functools import hashlib import itertools import json +import math import os.path import random import re +import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -28,7 +31,7 @@ ) from ..jsinterp import JSInterpreter from ..utils import ( - bytes_to_intlist, + bug_reports_message, clean_html, datetime_from_str, dict_get, @@ -37,10 +40,11 @@ float_or_none, format_field, int_or_none, - intlist_to_bytes, is_html, + join_nonempty, mimetype2ext, network_exceptions, + NO_DEFAULT, orderedSet, parse_codecs, parse_count, @@ -48,14 +52,17 @@ parse_iso8601, parse_qs, qualities, + remove_end, remove_start, smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -64,6 +71,10 @@ ) +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + + # any clients starting with _ cannot be explicity requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -71,7 +82,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20210622.10.00', + 'clientVersion': '2.20211221.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -81,7 +92,7 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20210620.0.1', + 'clientVersion': '1.20211215.00.01', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -92,115 +103,123 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211213.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211220.02.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.20', + 'clientVersion': '16.49', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, + 'REQUIRE_JS_PLAYER': False }, 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.20', + 'clientVersion': '16.49', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 55 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, + 'REQUIRE_JS_PLAYER': False }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, + 'REQUIRE_JS_PLAYER': False }, 'android_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 14 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, + 'REQUIRE_JS_PLAYER': False }, - # ios has HLS live streams - # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 + # iOS clients have HLS live streams. Setting device model to get 60fps formats. + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', } }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 5 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False }, 'ios_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 66 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, + 'REQUIRE_JS_PLAYER': False }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 26 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, + 'REQUIRE_JS_PLAYER': False }, 'ios_creator': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 15 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, + 'REQUIRE_JS_PLAYER': False }, # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20210721.07.00', + 'clientVersion': '2.20211221.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 - }, + } } @@ -214,6 +233,7 @@ def build_innertube_clients(): for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') + ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) @@ -236,7 +256,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _RESERVED_NAMES = ( - r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|' + r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') @@ -248,28 +268,78 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - r''' # Unused since login is broken - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - ''' + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + ) def _login(self): """ Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - def warn(message): - self.report_warning(message) - - # username+password login is broken if (self._LOGIN_REQUIRED and self.get_param('cookiefile') is None and self.get_param('cookiesfrombrowser') is None): @@ -277,184 +347,7 @@ def warn(message): 'Login details are needed to download this content', method='cookies') username, password = self._get_login_info() if username: - warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies']) - return - - # Everything below this is broken! - r''' - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - # if self.get_param('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them. - # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!') - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - # TODO: reverse actual botguard identifier generation algo - 'bgRequest': '["identifier",""]', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor ' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - ''' + self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}') def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') @@ -471,12 +364,22 @@ def _initialize_consent(self): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() - if self._downloader is None: - return - if not self._login(): - return + self._login() _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' @@ -498,13 +401,6 @@ def _extract_client_name(self, ytcfg, default_client='web'): ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client) - @staticmethod - def _extract_session_index(*data): - for ytcfg in data: - session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX'])) - if session_index is not None: - return session_index - def _extract_client_version(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], @@ -514,23 +410,10 @@ def _extract_api_key(self, ytcfg=None, default_client='web'): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language for extraction + traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en' return context _SAPISID = None @@ -576,24 +459,34 @@ def _call_api(self, ep, query, video_id, fatal=True, headers=None, data=json.dumps(data).encode('utf8'), headers=real_headers, query={'key': api_key or self._extract_api_key()}) - def extract_yt_initial_data(self, video_id, webpage): - return self._parse_json( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), - video_id) + def extract_yt_initial_data(self, item_id, webpage, fatal=True): + data = self._search_regex( + (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE), + self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal) + if data: + return self._parse_json(data, item_id, fatal=fatal) - def _extract_identity_token(self, webpage, item_id): - if not webpage: - return None - ytcfg = self.extract_ytcfg(item_id, webpage) + @staticmethod + def _extract_session_index(*data): + """ + Index of current account in account list. + See: https://github.com/yt-dlp/yt-dlp/pull/519 + """ + for ytcfg in data: + session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX'])) + if session_index is not None: + return session_index + + # Deprecated? + def _extract_identity_token(self, ytcfg=None, webpage=None): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: return token - return self._search_regex( - r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, - 'identity token', default=None) + if webpage: + return self._search_regex( + r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, + 'identity token', default=None, fatal=False) @staticmethod def _extract_account_syncid(*args): @@ -608,12 +501,26 @@ def _extract_account_syncid(*args): return delegated_sid sync_ids = (try_get( data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), compat_str) or '').split("||") + lambda x: x['DATASYNC_ID']), compat_str) or '').split('||') if len(sync_ids) >= 2 and sync_ids[1]: # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # and just "user_syncid||" for primary channel. We only want the channel_syncid return sync_ids[0] + @staticmethod + def _extract_visitor_data(*args): + """ + Extracts visitorData from an API response or ytcfg + Appears to be used to track session state + """ + return get_first( + args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + expected_type=str) + + @property + def is_authenticated(self): + return bool(self._generate_sapisidhash_header()) + def extract_ytcfg(self, video_id, webpage): if not webpage: return {} @@ -623,33 +530,29 @@ def extract_ytcfg(self, video_id, webpage): default='{}'), video_id, fatal=False) or {} def generate_api_headers( - self, ytcfg=None, identity_token=None, account_syncid=None, - visitor_data=None, api_hostname=None, default_client='web', session_index=None): + self, *, ytcfg=None, account_syncid=None, session_index=None, + visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): + origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) headers = { 'X-YouTube-Client-Name': compat_str( self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), - 'Origin': origin + 'Origin': origin, + 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), + 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) } - if not visitor_data and ytcfg: - visitor_data = try_get( - self._extract_context(ytcfg, default_client), lambda x: x['client']['visitorData'], compat_str) - if identity_token: - headers['X-Youtube-Identity-Token'] = identity_token - if account_syncid: - headers['X-Goog-PageId'] = account_syncid - if session_index is None and ytcfg: + if session_index is None: session_index = self._extract_session_index(ytcfg) if account_syncid or session_index is not None: headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 - if visitor_data: - headers['X-Goog-Visitor-Id'] = visitor_data + auth = self._generate_sapisidhash_header(origin) if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin - return headers + return {h: v for h, v in headers.items() if v is not None} @staticmethod def _build_api_continuation_query(continuation, ctp=None): @@ -720,7 +623,7 @@ def _extract_alerts(cls, data): if message: yield alert_type, message - def _report_alerts(self, alerts, expected=True, fatal=True): + def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): errors = [] warnings = [] for alert_type, alert_message in alerts: @@ -730,7 +633,7 @@ def _report_alerts(self, alerts, expected=True, fatal=True): warnings.append([alert_type, alert_message]) for alert_type, alert_message in (warnings + errors[:-1]): - self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message)) + self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once) if errors: raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected) @@ -767,6 +670,71 @@ def _get_text(data, *path_list, max_runs=None): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + """ + mobj = re.search(r'(?Ptoday|yesterday|now)|(?P