X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/b29165267f931c1135dabb7986b253f9c5673c52..1108613f021eea0f6d4c5786c94db98641af6d59:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e4854bead..636bf42b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals -import base64 import calendar import copy import datetime +import functools import hashlib import itertools import json @@ -13,8 +13,10 @@ import os.path import random import re +import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -30,7 +32,6 @@ from ..jsinterp import JSInterpreter from ..utils import ( bug_reports_message, - bytes_to_intlist, clean_html, datetime_from_str, dict_get, @@ -39,9 +40,9 @@ float_or_none, format_field, int_or_none, - intlist_to_bytes, is_html, join_nonempty, + js_to_json, mimetype2ext, network_exceptions, NO_DEFAULT, @@ -57,10 +58,12 @@ smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -80,7 +83,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20210622.10.00', + 'clientVersion': '2.20211221.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -90,7 +93,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20210620.0.1', + 'clientVersion': '1.20211215.00.01', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -101,96 +104,96 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211213.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20210621.00.00', + 'clientVersion': '1.20211220.02.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.20', + 'clientVersion': '16.49', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False }, 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.20', + 'clientVersion': '16.49', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, 'REQUIRE_JS_PLAYER': False }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False }, 'android_creator': { + 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False }, - # ios has HLS live streams - # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680 + # iOS clients have HLS live streams. Setting device model to get 60fps formats. + # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False }, 'ios_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.20', + 'clientVersion': '16.46', + 'deviceModel': 'iPhone14,3', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, 'REQUIRE_JS_PLAYER': False }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyDK3iBpDP9nHVTk2qL73FLJICfOC3c51Og', - 'INNERTUBE_HOST': 'music.youtube.com', + 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.32', + 'clientVersion': '4.57', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -200,7 +203,7 @@ def get_first(obj, keys, **kwargs): 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.24.100', + 'clientVersion': '21.47', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -209,41 +212,41 @@ def get_first(obj, keys, **kwargs): # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8', + 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20210721.07.00', + 'clientVersion': '2.20211221.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 - }, + } } def build_innertube_clients(): - third_party = { + THIRD_PARTY = { 'embedUrl': 'https://google.com', # Can be any valid URL } - base_clients = ('android', 'web', 'ios', 'mweb') - priority = qualities(base_clients[::-1]) + BASE_CLIENTS = ('android', 'web', 'ios', 'mweb') + priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') - ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) - if client in base_clients: - INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) + base_client, *variant = client.split('_') + ytcfg['priority'] = 10 * priority(base_client) + + if variant == ['embedded']: + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY + INNERTUBE_CLIENTS[f'{base_client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party agegate_ytcfg['priority'] -= 1 - elif client.endswith('_embedded'): - ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party ytcfg['priority'] -= 2 - else: + elif variant: ytcfg['priority'] -= 3 @@ -255,7 +258,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' - r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' + r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') @@ -275,6 +278,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?invidious\.zee\.li', r'(?:www\.)?invidious\.ethibox\.fr', r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', + r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', # youtube-dl invidious instances list r'(?:(?:www|no)\.)?invidiou\.sh', r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', @@ -360,7 +365,20 @@ def _initialize_consent(self): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() self._login() @@ -393,23 +411,11 @@ def _extract_api_key(self, ytcfg=None, default_client='web'): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language and tz for extraction + client_context = traverse_obj(context, 'client', expected_type=dict, default={}) + client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context _SAPISID = None @@ -510,7 +516,7 @@ def _extract_visitor_data(*args): Appears to be used to track session state """ return get_first( - args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) @property @@ -666,6 +672,72 @@ def _get_text(data, *path_list, max_runs=None): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + """ + mobj = re.search(r'(?Ptoday|yesterday|now)|(?P