X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/84bbc54599d7669cc8a08a9f4251bb60ec419b53..b440e1bb2211918ef2b34138a65e0cb6c3a66057:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 61804e2af..636bf42b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -225,28 +225,28 @@ def get_first(obj, keys, **kwargs): def build_innertube_clients(): - third_party = { + THIRD_PARTY = { 'embedUrl': 'https://google.com', # Can be any valid URL } - base_clients = ('android', 'web', 'ios', 'mweb') - priority = qualities(base_clients[::-1]) + BASE_CLIENTS = ('android', 'web', 'ios', 'mweb') + priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') - ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) - if client in base_clients: - INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) + base_client, *variant = client.split('_') + ytcfg['priority'] = 10 * priority(base_client) + + if variant == ['embedded']: + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY + INNERTUBE_CLIENTS[f'{base_client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party agegate_ytcfg['priority'] -= 1 - elif client.endswith('_embedded'): - ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party ytcfg['priority'] -= 2 - else: + elif variant: ytcfg['priority'] -= 3 @@ -844,7 +844,7 @@ def _extract_video(self, renderer): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - # 'upload_date': strftime_or_none(timestamp, '%Y%m%d'), + 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None, 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges @@ -2245,12 +2245,7 @@ def _extract_player_url(self, *ytcfgs, webpage=None): get_all=False, expected_type=compat_str) if not player_url: return - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - return player_url + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): res = self._download_webpage( @@ -2399,11 +2394,7 @@ def _decrypt_nsig(self, s, video_id, player_url): """Turn the encrypted n field into a working signature""" if player_url is None: raise ExtractorError('Cannot decrypt nsig without player_url') - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) + player_url = urljoin('https://www.youtube.com', player_url) sig_id = ('nsig_value', s) if sig_id in self._player_cache: @@ -2422,12 +2413,12 @@ def _decrypt_nsig(self, s, video_id, player_url): def _extract_n_function_name(self, jscode): nfunc, idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})(?:\[(?P\d+)\])?\([a-zA-Z0-9]\)', + r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]+)(?:\[(?P\d+)\])?\([a-zA-Z0-9]\)', jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) if not idx: return nfunc return json.loads(js_to_json(self._search_regex( - rf'var {nfunc}\s*=\s*(\[.+?\]);', jscode, + rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] def _extract_n_function(self, video_id, player_url): @@ -2945,6 +2936,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres' ]) streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[]) + approx_duration = max(traverse_obj(streaming_formats, (..., 'approxDurationMs'), expected_type=float_or_none) or [0]) or None for fmt in streaming_formats: if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): @@ -3004,17 +2996,23 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): itags[itag] = 'https' stream_ids.append(stream_id) - tbr = float_or_none( - fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + language_preference = ( + 10 if audio_track.get('audioIsDefault') and 10 + else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 + else -1) + # Some formats may have much smaller duration than others (possibly damaged during encoding) + # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 + is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) < approx_duration - 10000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', - ' (default)' if audio_track.get('audioIsDefault') else ''), + ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), - throttled and 'THROTTLED', delim=', '), + throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), 'source_preference': -10 if throttled else -1, 'fps': int_or_none(fmt.get('fps')) or None, 'height': height, @@ -3022,8 +3020,10 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): 'tbr': tbr, 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': audio_track.get('id', '').split('.')[0], - 'language_preference': 1 if audio_track.get('audioIsDefault') else -1, + 'language': join_nonempty(audio_track.get('id', '').split('.')[0], + 'desc' if language_preference < -1 else ''), + 'language_preference': language_preference, + 'preference': -10 if is_damaged else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3383,7 +3383,7 @@ def process_language(container, base_url, lang_code, sub_name, query): }) lang_subs.append({ 'ext': fmt, - 'url': update_url_query(base_url, query), + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), 'name': sub_name, }) @@ -3408,6 +3408,9 @@ def process_language(container, base_url, lang_code, sub_name, query): trans_name += format_field(lang_name, template=' from %s') process_language( automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + if lang_code == f'a-{trans_code}': + process_language( + automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {'tlang': trans_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles @@ -4241,6 +4244,16 @@ def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=Fals if 'webpage' not in self._configuration_arg('skip'): webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) + # Reject webpage data if redirected to home page without explicitly requesting + selected_tab = self._extract_selected_tab(traverse_obj( + data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[])) or {} + if (url != 'https://www.youtube.com/feed/recommended' + and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page + and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): + msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page' + if fatal: + raise ExtractorError(msg, expected=True) + self.report_warning(msg, only_once=True) if not data: if not ytcfg and self.is_authenticated: msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.'