X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/fcbc9ed760be6e3455bbadfaf277b4504b06f068..a25a424323267e3f6f9f63c0b62df499bd7b8d46:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 967914c0f..1f1db1ad3 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -11,17 +11,18 @@ import os.path import random import re +import shlex import sys import threading import time import traceback -import urllib.error import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..compat import functools from ..jsinterp import JSInterpreter +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -32,6 +33,7 @@ clean_html, datetime_from_str, dict_get, + filesize_from_tbr, filter_dict, float_or_none, format_field, @@ -41,7 +43,6 @@ join_nonempty, js_to_json, mimetype2ext, - network_exceptions, orderedSet, parse_codecs, parse_count, @@ -55,6 +56,7 @@ str_to_int, strftime_or_none, traverse_obj, + try_call, try_get, unescapeHTML, unified_strdate, @@ -115,9 +117,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '17.31.35', + 'clientVersion': '19.09.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' + 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, @@ -128,9 +130,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '17.31.35', + 'clientVersion': '19.09.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' + 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, @@ -141,9 +143,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '5.16.51', + 'clientVersion': '6.42.52', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.music/5.16.51 (Linux; U; Android 11) gzip' + 'userAgent': 'com.google.android.apps.youtube.music/6.42.52 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, @@ -169,9 +171,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '17.33.2', + 'clientVersion': '19.09.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, @@ -181,9 +183,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '17.33.2', + 'clientVersion': '19.09.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, @@ -194,9 +196,9 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '5.21', + 'clientVersion': '6.33.3', 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtubemusic/5.21 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' + 'userAgent': 'com.google.ios.youtubemusic/6.33.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -429,7 +431,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?piped\.adminforge\.de', r'(?:www\.)?watch\.whatevertinfoil\.de', r'(?:www\.)?piped\.qdi\.fi', - r'(?:www\.)?piped\.video', + r'(?:(?:www|cf)\.)?piped\.video', r'(?:www\.)?piped\.aeong\.one', r'(?:www\.)?piped\.moomoo\.me', r'(?:www\.)?piped\.chauvet\.pro', @@ -497,16 +499,10 @@ def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + socs = cookies.get('SOCS') + if socs and not socs.value.startswith('CAA'): # not consented + return + self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): cookies = self._get_cookies('https://www.youtube.com/') @@ -909,7 +905,7 @@ def extract_relative_time(relative_time_text): e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - # XXX: this could be moved to a general function in utils.py + # XXX: this could be moved to a general function in utils/_utils.py # The relative time text strings are roughly the same as what # Javascript's Intl.RelativeTimeFormat function generates. # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat @@ -948,7 +944,16 @@ def _parse_time_text(self, text): def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): - for retry in self.RetryManager(): + raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE)) + # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal. + icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete)) + icd_rm = next(icd_retries) + main_retries = iter(self.RetryManager()) + main_rm = next(main_retries) + # Manual retry loop for multiple RetryManagers + # The proper RetryManager MUST be advanced after an error + # and its result MUST be checked if the manager is non fatal + while True: try: response = self._call_api( ep=ep, fatal=True, headers=headers, @@ -959,40 +964,46 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers except ExtractorError as e: if not isinstance(e.cause, network_exceptions): return self._error_or_warning(e, fatal=fatal) - elif not isinstance(e.cause, urllib.error.HTTPError): - retry.error = e + elif not isinstance(e.cause, HTTPError): + main_rm.error = e + next(main_retries) continue - first_bytes = e.cause.read(512) + first_bytes = e.cause.response.read(512) if not is_html(first_bytes): yt_error = try_get( self._parse_json( - self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), lambda x: x['error']['message'], str) if yt_error: self._report_alerts([('ERROR', yt_error)], fatal=False) # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 - if e.cause.code not in (403, 429): - retry.error = e + if e.cause.status not in (403, 429): + main_rm.error = e + next(main_retries) continue return self._error_or_warning(e, fatal=fatal) try: self._extract_and_report_alerts(response, only_once=True) except ExtractorError as e: - # YouTube servers may return errors we want to retry on in a 200 OK response + # YouTube's servers may return errors we want to retry on in a 200 OK response # See: https://github.com/yt-dlp/yt-dlp/issues/839 if 'unknown error' in e.msg.lower(): - retry.error = e + main_rm.error = e + next(main_retries) continue return self._error_or_warning(e, fatal=fatal) # Youtube sometimes sends incomplete data # See: https://github.com/ytdl-org/youtube-dl/issues/28194 if not traverse_obj(response, *variadic(check_get_keys)): - retry.error = ExtractorError('Incomplete data received', expected=True) + icd_rm.error = ExtractorError('Incomplete data received', expected=True) + should_retry = next(icd_retries, None) + if not should_retry: + return None continue return response @@ -2060,11 +2071,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Voyeur Girl', 'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'upload_date': '20190312', - 'artist': 'Stephen', + 'artists': ['Stephen'], + 'creators': ['Stephen'], 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', 'release_date': '20190313', - 'release_year': 2019, 'alt_title': 'Voyeur Girl', 'view_count': int, 'playable_in_embed': True, @@ -2074,7 +2085,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel': 'Stephen', # TODO: should be "Stephen - Topic" 'uploader': 'Stephen', 'availability': 'public', - 'creator': 'Stephen', 'duration': 169, 'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp', 'age_limit': 0, @@ -2499,29 +2509,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@abaointokyo', }, 'params': {'skip_download': True} - }, { - # Story. Requires specific player params to work. - 'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI', - 'info_dict': { - 'id': 'vv8qTUWmulI', - 'ext': 'mp4', - 'availability': 'unlisted', - 'view_count': int, - 'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA', - 'upload_date': '20220526', - 'categories': ['Education'], - 'title': 'Story', - 'channel': 'IT\'S HISTORY', - 'description': '', - 'duration': 12, - 'playable_in_embed': True, - 'age_limit': 0, - 'live_status': 'not_live', - 'tags': [], - 'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp', - 'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA', - }, - 'skip': 'stories get removed after some period of time', }, { 'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA', 'info_dict': { @@ -2860,7 +2847,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -3140,7 +3127,7 @@ def _extract_n_function_name(self, jscode): return funcname return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])[,;]', jscode, + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] def _extract_n_function_code(self, video_id, player_url): @@ -3310,16 +3297,15 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) - def _extract_heatmap_from_player_overlay(self, data): - content_list = traverse_obj(data, ( - 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', - 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) - return next(filter(None, ( - traverse_obj(contents, (..., 'heatMarkerRenderer', { - 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), - 'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000}, - 'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}), - })) for contents in content_list)), None) + def _extract_heatmap(self, data): + return traverse_obj(data, ( + 'frameworkUpdates', 'entityBatchUpdate', 'mutations', + lambda _, v: v['payload']['macroMarkersListEntity']['markersList']['markerType'] == 'MARKER_TYPE_HEATMAP', + 'payload', 'macroMarkersListEntity', 'markersList', 'markers', ..., { + 'start_time': ('startMillis', {functools.partial(float_or_none, scale=1000)}), + 'end_time': {lambda x: (int(x['startMillis']) + int(x['durationMillis'])) / 1000}, + 'value': ('intensityScoreNormalized', {float_or_none}), + })) or None def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -3426,7 +3412,9 @@ def extract_thread(contents): # Pinned comments may appear a second time in newest first sort # See: https://github.com/yt-dlp/yt-dlp/issues/6712 continue - self.report_warning('Detected YouTube comments looping. Stopping comment extraction as we probably cannot get any more.') + self.report_warning( + 'Detected YouTube comments looping. Stopping comment extraction ' + f'{"for this thread" if parent else ""} as we probably cannot get any more.') yield else: tracker['seen_comment_ids'].add(comment['id']) @@ -3517,12 +3505,18 @@ def extract_thread(contents): # Ignore incomplete data error for replies if retries didn't work. # This is to allow any other parent comments and comment threads to be downloaded. # See: https://github.com/yt-dlp/yt-dlp/issues/4669 - if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True: - self.report_warning( - 'Received incomplete data for a comment reply thread and retrying did not help. ' - 'Ignoring to let other comments be downloaded.') - else: - raise + if 'incomplete data' in str(e).lower() and parent: + if self.get_param('ignoreerrors') in (True, 'only_download'): + self.report_warning( + 'Received incomplete data for a comment reply thread and retrying did not help. ' + 'Ignoring to let other comments be downloaded. Pass --no-ignore-errors to not ignore.') + return + else: + raise ExtractorError( + 'Incomplete data received for comment reply thread. ' + 'Pass --ignore-errors to ignore and allow rest of comments to download.', + expected=True) + raise is_forced_continuation = False continuation = None for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): @@ -3599,8 +3593,6 @@ def _is_agegated(player_response): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - _PLAYER_PARAMS = 'CgIQBg==' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): session_index = self._extract_session_index(player_ytcfg, master_ytcfg) @@ -3612,8 +3604,12 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, } - if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': - yt_query['params'] = self._PLAYER_PARAMS + if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'): + yt_query['params'] = 'CgIIAQ==' + + pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] + if pp_arg: + yt_query['params'] = pp_arg yt_query.update(self._generate_player_context(sts)) return self._extract_response( @@ -3647,15 +3643,28 @@ def _get_requested_clients(self, url, smuggled_data): return orderedSet(requested_clients) + def _invalid_player_response(self, pr, video_id): + # YouTube may return a different video player response than expected. + # See: https://github.com/TeamNewPipe/NewPipe/issues/8713 + if (pr_id := traverse_obj(pr, ('videoDetails', 'videoId'))) != video_id: + return pr_id + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): initial_pr = None if webpage: initial_pr = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False) + prs = [] + if initial_pr and not self._invalid_player_response(initial_pr, video_id): + # Android player_response does not have microFormats which are needed for + # extraction of some data. So we return the initial_pr with formats + # stripped out even if not requested by the user + # See: https://github.com/yt-dlp/yt-dlp/issues/501 + prs.append({**initial_pr, 'streamingData': None}) + all_clients = set(clients) clients = clients[::-1] - prs = [] def append_client(*client_names): """ Append the first client name that exists but not already used """ @@ -3667,18 +3676,9 @@ def append_client(*client_names): all_clients.add(actual_client) return - # Android player_response does not have microFormats which are needed for - # extraction of some data. So we return the initial_pr with formats - # stripped out even if not requested by the user - # See: https://github.com/yt-dlp/yt-dlp/issues/501 - if initial_pr: - pr = dict(initial_pr) - pr['streamingData'] = None - prs.append(pr) - - last_error = None tried_iframe_fallback = False player_url = None + skipped_clients = {} while clients: client, base_client, variant = _split_innertube_client(clients.pop()) player_ytcfg = master_ytcfg if client == 'web' else {} @@ -3699,26 +3699,19 @@ def append_client(*client_names): pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data) except ExtractorError as e: - if last_error: - self.report_warning(last_error) - last_error = e + self.report_warning(e) continue - if pr: - # YouTube may return a different video player response than expected. - # See: https://github.com/TeamNewPipe/NewPipe/issues/8713 - pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId')) - if pr_video_id and pr_video_id != video_id: - self.report_warning( - f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message()) - else: - # Save client name for introspection later - name = short_client_name(client) - sd = traverse_obj(pr, ('streamingData', {dict})) or {} - sd[STREAMING_DATA_CLIENT_NAME] = name - for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): - f[STREAMING_DATA_CLIENT_NAME] = name - prs.append(pr) + if pr_id := self._invalid_player_response(pr, video_id): + skipped_clients[client] = pr_id + elif pr: + # Save client name for introspection later + name = short_client_name(client) + sd = traverse_obj(pr, ('streamingData', {dict})) or {} + sd[STREAMING_DATA_CLIENT_NAME] = name + for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): + f[STREAMING_DATA_CLIENT_NAME] = name + prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: @@ -3729,10 +3722,15 @@ def append_client(*client_names): elif not variant: append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded') - if last_error: - if not len(prs): - raise last_error - self.report_warning(last_error) + if skipped_clients: + self.report_warning( + f'Skipping player responses from {"/".join(skipped_clients)} clients ' + f'(got player responses for video "{"/".join(set(skipped_clients.values()))}" instead of "{video_id}")') + if not prs: + raise ExtractorError( + 'All player responses are invalid. Your IP is likely being blocked by Youtube', expected=True) + elif not prs: + raise ExtractorError('Failed to extract any player response') return prs, player_url def _needs_live_processing(self, live_status, duration): @@ -3843,11 +3841,12 @@ def build_fragments(f): 10 if audio_track.get('audioIsDefault') and 10 else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) + format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 # Make sure to avoid false positives with small duration differences. # E.g. __2ABJjxzNo, ySuUZEjARPY - is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) + is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) @@ -3877,6 +3876,7 @@ def build_fragments(f): 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, + 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(audio_track.get('id', '').split('.')[0], @@ -3942,9 +3942,12 @@ def process_manifest_format(f, proto, client_name, itag): elif itag: f['format_id'] = itag + if f.get('source_preference') is None: + f['source_preference'] = -1 + if itag in ('616', '235'): f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') - f['source_preference'] = (f.get('source_preference') or -1) + 100 + f['source_preference'] += 100 f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) if f['quality'] == -1 and f.get('height'): @@ -3953,6 +3956,10 @@ def process_manifest_format(f, proto, client_name, itag): f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ') if f.get('fps') and f['fps'] <= 1: del f['fps'] + + if proto == 'hls' and f.get('has_drm'): + f['has_drm'] = 'maybe' + f['source_preference'] -= 5 return True subtitles = {} @@ -4025,8 +4032,9 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): query = {'bpctr': '9999999999', 'has_verified': '1'} - if smuggled_data.get('is_story'): # XXX: Deprecated - query['pp'] = self._PLAYER_PARAMS + pp = self._configuration_arg('player_params', [None], casesense=True)[0] + if pp: + query['pp'] = pp webpage = self._download_webpage( webpage_url, video_id, fatal=False, query=query) @@ -4054,6 +4062,10 @@ def _list_formats(self, video_id, microformats, video_details, player_responses, else None) streaming_data = traverse_obj(player_responses, (..., 'streamingData')) *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) + if all(f.get('has_drm') for f in formats): + # If there are no formats that definitely don't have DRM, all have DRM + for f in formats: + f['has_drm'] = True return live_broadcast_details, live_status, streaming_data, formats, subtitles @@ -4381,7 +4393,8 @@ def process_language(container, base_url, lang_code, sub_name, query): release_year = release_date[:4] info.update({ 'album': mobj.group('album'.strip()), - 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), + 'artists': ([a] if (a := mobj.group('clean_artist')) + else [a.strip() for a in mobj.group('artist').split('·')]), 'track': mobj.group('track').strip(), 'release_date': release_date, 'release_year': int_or_none(release_year), @@ -4431,7 +4444,7 @@ def process_language(container, base_url, lang_code, sub_name, query): or self._extract_chapters_from_description(video_description, duration) or None) - info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data) + info['heatmap'] = self._extract_heatmap(initial_data) contents = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'), @@ -4475,14 +4488,13 @@ def process_language(container, base_url, lang_code, sub_name, query): if mobj: info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) break - sbr_tooltip = try_get( - vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) - if sbr_tooltip: - like_count, dislike_count = sbr_tooltip.split(' / ') - info.update({ - 'like_count': str_to_int(like_count), - 'dislike_count': str_to_int(dislike_count), - }) + + info['like_count'] = traverse_obj(vpir, ( + 'videoActions', 'menuRenderer', 'topLevelButtons', ..., + 'segmentedLikeDislikeButtonViewModel', 'likeButtonViewModel', 'likeButtonViewModel', + 'toggleButtonViewModel', 'toggleButtonViewModel', 'defaultButtonViewModel', + 'buttonViewModel', 'accessibilityText', {parse_count}), get_all=False) + vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) if vcr: vc = self._get_count(vcr, 'viewCount') @@ -4528,7 +4540,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if mrr_title == 'Album': info['album'] = mrr_contents_text elif mrr_title == 'Artist': - info['artist'] = mrr_contents_text + info['artists'] = [mrr_contents_text] if mrr_contents_text else None elif mrr_title == 'Song': info['track'] = mrr_contents_text owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges'))) @@ -4554,7 +4566,15 @@ def process_language(container, base_url, lang_code, sub_name, query): self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date info['upload_date'] = upload_date - for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'): + # Newly uploaded videos' HLS formats are potentially problematic and need to be checked + upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc) + if upload_datetime >= datetime_from_str('today-2days'): + for fmt in info['formats']: + if fmt.get('protocol') == 'm3u8_native': + fmt['__needs_testing'] = True + + for s_k, d_k in [('artists', 'creators'), ('track', 'alt_title')]: v = info.get(s_k) if v: info[d_k] = v @@ -4927,10 +4947,15 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) yield from extract_entries(parent_renderer) continuation = continuation_list[0] - + seen_continuations = set() for page_num in itertools.count(1): if not continuation: break + continuation_token = continuation.get('continuation') + if continuation_token is not None and continuation_token in seen_continuations: + self.write_debug('Detected YouTube feed looping - assuming end of feed.') + break + seen_continuations.add(continuation_token) headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data) response = self._extract_response( @@ -5067,7 +5092,8 @@ def _get_uncropped(url): 'availability': self._extract_availability(data), 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), - 'tags': try_get(metadata_renderer or {}, lambda x: x.get('keywords', '').split()), + 'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str})) + or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))), 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners, }) @@ -5264,7 +5290,7 @@ def _extract_webpage(self, url, item_id, fatal=True): data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429): retry.error = e continue self._error_or_warning(e, fatal=fatal) @@ -5280,6 +5306,7 @@ def _extract_webpage(self, url, item_id, fatal=True): # See: https://github.com/yt-dlp/yt-dlp/issues/116 if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): retry.error = ExtractorError('Incomplete yt initial data received') + data = None continue return webpage, data @@ -5399,14 +5426,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader': 'Igor Kleiner', + 'title': 'Igor Kleiner Ph.D. - Playlists', + 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', + 'uploader': 'Igor Kleiner Ph.D.', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner', + 'channel': 'Igor Kleiner Ph.D.', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], + 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int }, @@ -5416,14 +5443,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader': 'Igor Kleiner', + 'title': 'Igor Kleiner Ph.D. - Playlists', + 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', + 'uploader': 'Igor Kleiner Ph.D.', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], + 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner', + 'channel': 'Igor Kleiner Ph.D.', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int }, @@ -5434,7 +5461,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Playlists', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', @@ -5458,7 +5485,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@ThirstForScience', 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', - 'tags': 'count:13', + 'tags': 'count:12', 'channel': 'ThirstForScience', 'channel_follower_count': int } @@ -5493,10 +5520,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'tags': [], 'channel': 'Sergey M.', 'description': '', - 'modified_date': '20160902', + 'modified_date': '20230921', 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'availability': 'public', + 'availability': 'unlisted', 'uploader_url': 'https://www.youtube.com/@sergeym.6173', 'uploader_id': '@sergeym.6173', 'uploader': 'Sergey M.', @@ -5611,7 +5638,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', @@ -5880,7 +5907,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/hashtag/cctv9', 'info_dict': { 'id': 'cctv9', - 'title': '#cctv9', + 'title': 'cctv9 - All', 'tags': [], }, 'playlist_mincount': 300, # not consistent but should be over 300 @@ -6158,12 +6185,13 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:e56b74b5bb7e9c701522162e9abfb822', + 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', 'uploader_url': 'https://www.youtube.com/@OmaruPolka', 'uploader': 'Polka Ch. 尾丸ポルカ', 'uploader_id': '@OmaruPolka', + 'channel_is_verified': True, }, 'playlist_count': 3, }, { @@ -6173,15 +6201,16 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', - 'tags': 'count:12', + 'tags': 'count:10', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:26bc55af26855a608a5cf89dfa595c8d', + 'description': 'md5:5e82545b3a041345927a92d0585df247', 'channel_follower_count': int, 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.youtube.com/@NotJustBikes', 'uploader': 'Not Just Bikes', 'uploader_id': '@NotJustBikes', + 'channel_is_verified': True, }, 'playlist_mincount': 10, }, { @@ -6341,15 +6370,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/@3blue1brown/about', 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'id': '@3blue1brown', 'tags': ['Mathematics'], - 'title': '3Blue1Brown - About', + 'title': '3Blue1Brown', 'channel_follower_count': int, 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', - 'view_count': int, 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', @@ -6372,7 +6400,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': '99 Percent Invisible', 'uploader_id': '@99percentinvisiblepodcast', }, - 'playlist_count': 1, + 'playlist_count': 0, }, { # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) 'url': 'https://www.youtube.com/@AHimitsu/releases', @@ -6384,7 +6412,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@AHimitsu', 'uploader': 'A Himitsu', 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', - 'tags': 'count:16', + 'tags': 'count:12', 'description': 'I make music', 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', 'channel_follower_count': int, @@ -6408,11 +6436,32 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'Bangy Shorts', 'tags': [], 'availability': 'public', - 'modified_date': '20230626', + 'modified_date': r're:\d{8}', 'title': 'Uploads from Bangy Shorts', }, 'playlist_mincount': 100, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'Tags containing spaces', + 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', + 'playlist_count': 3, + 'info_dict': { + 'id': 'UC7_YxT-KID8kRbqZo7MyscQ', + 'channel': 'Markiplier', + 'channel_id': 'UC7_YxT-KID8kRbqZo7MyscQ', + 'title': 'Markiplier', + 'channel_follower_count': int, + 'description': 'md5:0c010910558658824402809750dc5d97', + 'uploader_id': '@markiplier', + 'uploader_url': 'https://www.youtube.com/@markiplier', + 'uploader': 'Markiplier', + 'channel_url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', + 'channel_is_verified': True, + 'tags': ['markiplier', 'comedy', 'gaming', 'funny videos', 'funny moments', + 'sketch comedy', 'laughing', 'lets play', 'challenge videos', 'hilarious', + 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', + 'mark fischbach'], + }, }] @classmethod @@ -6451,6 +6500,9 @@ def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'): def _has_tab(self, tabs, tab_id): return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs) + def _empty_playlist(self, item_id, data): + return self.playlist_result([], item_id, **self._extract_metadata_from_tabs(item_id, data)) + @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data def _real_extract(self, url, smuggled_data): item_id = self._match_id(url) @@ -6516,6 +6568,10 @@ def _real_extract(self, url, smuggled_data): selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url) # NB: Name may be translated self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}') + # /about is no longer a tab + if original_tab_id == 'about': + return self._empty_playlist(item_id, data) + if not original_tab_id and selected_tab_name: self.to_screen('Downloading all uploads of the channel. ' 'To download only the videos in a specific tab, pass the tab\'s URL') @@ -6528,7 +6584,7 @@ def _real_extract(self, url, smuggled_data): if not extra_tabs and selected_tab_id != 'videos': # Channel does not have streams, shorts or videos tabs if item_id[:2] != 'UC': - raise ExtractorError('This channel has no uploads', expected=True) + return self._empty_playlist(item_id, data) # Topic channels don't have /videos. Use the equivalent playlist instead pl_id = f'UU{item_id[2:]}' @@ -6536,7 +6592,7 @@ def _real_extract(self, url, smuggled_data): try: data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) except ExtractorError: - raise ExtractorError('This channel has no uploads', expected=True) + return self._empty_playlist(item_id, data) else: item_id, url = pl_id, pl_url self.to_screen( @@ -6668,7 +6724,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/@milan5503', 'availability': 'public', }, - 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'], + 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden', 'Retrying', 'Giving up'], }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'playlist_mincount': 455, @@ -6913,7 +6969,7 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only _TESTS = [{ 'url': 'ytsearch5:youtube-dl test video', 'playlist_count': 5, @@ -6921,6 +6977,14 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', } + }, { + 'note': 'Suicide/self-harm search warning', + 'url': 'ytsearch1:i hate myself and i wanna die', + 'playlist_count': 1, + 'info_dict': { + 'id': 'i hate myself and i wanna die', + 'title': 'i hate myself and i wanna die', + } }] @@ -6928,7 +6992,7 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube search, newest videos first' - _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date _TESTS = [{ 'url': 'ytsearchdate5:youtube-dl test video', 'playlist_count': 5, @@ -7137,22 +7201,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): }] -class YoutubeStoriesIE(InfoExtractor): - IE_DESC = 'YouTube channel stories; "ytstories:" prefix' - IE_NAME = 'youtube:stories' - _VALID_URL = r'ytstories:UC(?P[A-Za-z0-9_-]{21}[AQgw])$' - _TESTS = [{ - 'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = f'RLTD{self._match_id(url)}' - return self.url_result( - smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}), - ie=YoutubeTabIE, video_id=playlist_id) - - class YoutubeShortsAudioPivotIE(InfoExtractor): IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' IE_NAME = 'youtube:shorts:pivot:audio'