X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/05799a48c7dec12b34c8bf951c8d2eceedda59f8..24f3097ea9a470a984d0454dc013cafa2325f5f8:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 01e2e3793..094b1e9a3 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,7 +2,7 @@ import calendar import collections import copy -import datetime +import datetime as dt import enum import hashlib import itertools @@ -11,17 +11,18 @@ import os.path import random import re +import shlex import sys import threading import time import traceback -import urllib.error import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..compat import functools from ..jsinterp import JSInterpreter +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -32,6 +33,7 @@ clean_html, datetime_from_str, dict_get, + filesize_from_tbr, filter_dict, float_or_none, format_field, @@ -41,7 +43,6 @@ join_nonempty, js_to_json, mimetype2ext, - network_exceptions, orderedSet, parse_codecs, parse_count, @@ -55,6 +56,7 @@ str_to_int, strftime_or_none, traverse_obj, + try_call, try_get, unescapeHTML, unified_strdate, @@ -66,6 +68,7 @@ variadic, ) +STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -74,9 +77,9 @@ 'client': { 'clientName': 'WEB', 'clientVersion': '2.20220801.00.00', - } + }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, }, 'web_embedded': { 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', @@ -86,7 +89,7 @@ 'clientVersion': '1.20220731.00.00', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, }, 'web_music': { 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', @@ -95,7 +98,7 @@ 'client': { 'clientName': 'WEB_REMIX', 'clientVersion': '1.20220727.01.00', - } + }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, @@ -105,7 +108,7 @@ 'client': { 'clientName': 'WEB_CREATOR', 'clientVersion': '1.20220726.00.00', - } + }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, @@ -114,39 +117,39 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '17.31.35', + 'clientVersion': '19.09.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' - } + 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip', + }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'android_embedded': { 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '17.31.35', + 'clientVersion': '19.09.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' + 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'android_music': { 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '5.16.51', + 'clientVersion': '6.42.52', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.music/5.16.51 (Linux; U; Android 11) gzip' - } + 'userAgent': 'com.google.android.apps.youtube.music/6.42.52 (Linux; U; Android 11) gzip', + }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'android_creator': { 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', @@ -155,11 +158,11 @@ 'clientName': 'ANDROID_CREATOR', 'clientVersion': '22.30.100', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip' + 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 @@ -168,38 +171,38 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '17.33.2', + 'clientVersion': '19.09.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' - } + 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'ios_embedded': { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '17.33.2', + 'clientVersion': '19.09.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'ios_music': { 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '5.21', + 'clientVersion': '6.33.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtubemusic/5.21 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtubemusic/6.33.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, 'ios_creator': { 'INNERTUBE_CONTEXT': { @@ -207,11 +210,11 @@ 'clientName': 'IOS_CREATOR', 'clientVersion': '22.33.101', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, - 'REQUIRE_JS_PLAYER': False + 'REQUIRE_JS_PLAYER': False, }, # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 @@ -221,9 +224,9 @@ 'client': { 'clientName': 'MWEB', 'clientVersion': '2.20220801.00.00', - } + }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, }, # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) # See: https://github.com/zerodytrash/YouTube-Internal-Clients @@ -235,7 +238,17 @@ 'clientVersion': '2.0', }, }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 85 + 'INNERTUBE_CONTEXT_CLIENT_NAME': 85, + }, + # This client has pre-merged video+audio 720p/1080p streams + 'mediaconnect': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MEDIA_CONNECT_FRONTEND', + 'clientVersion': '0.1', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 95, }, } @@ -248,11 +261,16 @@ def _split_innertube_client(client_name): return client_name, base, variant[0] if variant else None +def short_client_name(client_name): + main, *parts = _split_innertube_client(client_name)[0].replace('embedscreen', 'e_s').split('_') + return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper() + + def build_innertube_clients(): THIRD_PARTY = { 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL } - BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb') + BASE_CLIENTS = ('ios', 'android', 'web', 'tv', 'mweb') priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): @@ -286,6 +304,7 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() class YoutubeBaseInfoExtractor(InfoExtractor): @@ -422,7 +441,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?piped\.adminforge\.de', r'(?:www\.)?watch\.whatevertinfoil\.de', r'(?:www\.)?piped\.qdi\.fi', - r'(?:www\.)?piped\.video', + r'(?:(?:www|cf)\.)?piped\.video', r'(?:www\.)?piped\.aeong\.one', r'(?:www\.)?piped\.moomoo\.me', r'(?:www\.)?piped\.chauvet\.pro', @@ -430,6 +449,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?pd\.vern\.cc', r'(?:www\.)?piped\.hostux\.net', r'(?:www\.)?piped\.lunar\.icu', + # Hyperpipe instances from https://hyperpipe.codeberg.page/ + r'(?:www\.)?hyperpipe\.surge\.sh', + r'(?:www\.)?hyperpipe\.esmailelbob\.xyz', + r'(?:www\.)?listen\.whatever\.social', + r'(?:www\.)?music\.adminforge\.de', ) # extracted from account/account_menu ep @@ -441,10 +465,30 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi', 'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw', 'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml', - 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko' + 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko', ] - _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + _IGNORED_WARNINGS = { + 'Unavailable videos will be hidden during playback', + 'Unavailable videos are hidden', + } + + _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en + _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + + def ucid_or_none(self, ucid): + return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) + + def handle_or_none(self, handle): + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + + def handle_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', + url, 'channel handle', default=None) + + def ucid_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', + url, 'channel id', default=None) @functools.cached_property def _preferred_lang(self): @@ -468,16 +512,10 @@ def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + socs = cookies.get('SOCS') + if socs and not socs.value.startswith('CAA'): # not consented + return + self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): cookies = self._get_cookies('https://www.youtube.com/') @@ -663,7 +701,7 @@ def generate_api_headers( 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), - 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client) + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), } if session_index is None: session_index = self._extract_session_index(ytcfg) @@ -680,7 +718,7 @@ def _download_ytcfg(self, client, video_id): url = { 'web': 'https://www.youtube.com', 'web_music': 'https://music.youtube.com', - 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1' + 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', }.get(client) if not url: return {} @@ -691,7 +729,7 @@ def _download_ytcfg(self, client, video_id): @staticmethod def _build_api_continuation_query(continuation, ctp=None): query = { - 'continuation': continuation + 'continuation': continuation, } # TODO: Inconsistency with clickTrackingParams. # Currently we have a fixed ctp contained within context (from ytcfg) @@ -731,7 +769,7 @@ def _extract_continuation(cls, renderer): return traverse_obj(renderer, ( ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', - ('continuationEndpoint', ('button', 'buttonRenderer', 'command')) + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), ), get_all=False, expected_type=cls._extract_continuation_ep_data) @classmethod @@ -758,22 +796,31 @@ def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): for alert_type, alert_message in (warnings + errors[:-1]): self.report_warning(f'YouTube said: {alert_type} - {alert_message}', only_once=only_once) if errors: - raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected) + raise ExtractorError(f'YouTube said: {errors[-1][1]}', expected=expected) def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. + @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, } label_map = { @@ -781,13 +828,15 @@ def _extract_badges(self, renderer: dict): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer')): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -795,11 +844,12 @@ def _extract_badges(self, renderer: dict): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -838,14 +888,14 @@ def _get_count(self, data, *path_list): return count @staticmethod - def _extract_thumbnails(data, *path_list): + def _extract_thumbnails(data, *path_list, final_key='thumbnails'): """ Extract thumbnails from thumbnails dict @param path_list: path list to level that contains 'thumbnails' key """ thumbnails = [] for path in path_list or [()]: - for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)): + for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -865,25 +915,32 @@ def _extract_thumbnails(data, *path_list): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - mobj = re.search(r'(?Ptoday|yesterday|now)|(?P