X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/37e57a9fd48a3f01be0cc7b510aaac6e534bd27f..4a3175fc4cff22343bd23c6cb7d40dbd7d0ccbf5:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 20452bb70..8ee688798 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -5,6 +5,7 @@ import calendar import copy import datetime +import functools import hashlib import itertools import json @@ -15,6 +16,7 @@ import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -37,9 +39,11 @@ ExtractorError, float_or_none, format_field, + get_first, int_or_none, is_html, join_nonempty, + js_to_json, mimetype2ext, network_exceptions, NO_DEFAULT, @@ -55,10 +59,12 @@ smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -67,10 +73,6 @@ ) -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - - # any clients starting with _ cannot be explicity requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -78,7 +80,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20210622.10.00', + 'clientVersion': '2.20211221.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -88,7 +90,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20210620.0.1', + 'clientVersion': '1.20211215.00.01', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -99,96 +101,96 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211213.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211220.02.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.20', + 'clientVersion': '16.49', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False }, 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.20', + 'clientVersion': '16.49', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, 'REQUIRE_JS_PLAYER': False }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False }, 'android_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False }, - # ios has HLS live streams - # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 + # iOS clients have HLS live streams. Setting device model to get 60fps formats. + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False }, 'ios_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, 'REQUIRE_JS_PLAYER': False }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -198,7 +200,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -207,39 +209,41 @@ def get_first(obj, keys, **kwargs): # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20210721.07.00', + 'clientVersion': '2.20211221.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 - }, + } } def build_innertube_clients(): - third_party = { + THIRD_PARTY = { 'embedUrl': 'https://google.com', # Can be any valid URL } - base_clients = ('android', 'web', 'ios', 'mweb') - priority = qualities(base_clients[::-1]) + BASE_CLIENTS = ('android', 'web', 'ios', 'mweb') + priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') - ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) - if client in base_clients: + base_client, *variant = client.split('_') + ytcfg['priority'] = 10 * priority(base_client) + + if not variant: INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY agegate_ytcfg['priority'] -= 1 - elif client.endswith('_embedded'): - ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party + elif variant == ['embedded']: + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY ytcfg['priority'] -= 2 else: ytcfg['priority'] -= 3 @@ -253,13 +257,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' - r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' + r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' - _NETRC_MACHINE = 'youtube' + # _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -273,6 +277,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?invidious\.zee\.li', r'(?:www\.)?invidious\.ethibox\.fr', r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', # youtube-dl invidious instances list r'(?:(?:www|no)\.)?invidiou\.sh', r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', @@ -328,21 +334,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', ) - def _login(self): - """ - Attempt to log in to YouTube. - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - - if (self._LOGIN_REQUIRED - and self.get_param('cookiefile') is None - and self.get_param('cookiesfrombrowser') is None): - self.raise_login_required( - 'Login details are needed to download this content', method='cookies') - username, password = self._get_login_info() - if username: - self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}') - def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): @@ -358,9 +349,25 @@ def _initialize_consent(self): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() - self._login() + if (self._LOGIN_REQUIRED + and self.get_param('cookiefile') is None + and self.get_param('cookiesfrombrowser') is None): + self.raise_login_required('Login details are needed to download this content', method='cookies') _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' @@ -391,23 +398,11 @@ def _extract_api_key(self, ytcfg=None, default_client='web'): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language and tz for extraction + client_context = traverse_obj(context, 'client', expected_type=dict, default={}) + client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context _SAPISID = None @@ -451,7 +446,7 @@ def _call_api(self, ep, query, video_id, fatal=True, headers=None, 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key or self._extract_api_key()}) + query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'}) def extract_yt_initial_data(self, item_id, webpage, fatal=True): data = self._search_regex( @@ -508,7 +503,7 @@ def _extract_visitor_data(*args): Appears to be used to track session state """ return get_first( - args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @property @@ -664,6 +659,72 @@ def _get_text(data, *path_list, max_runs=None): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + """ + mobj = re.search(r'(?Ptoday|yesterday|now)|(?P