X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/e0ddbd02bd1c365b95bb88eaa6e4e0238faf35eb..b440e1bb2211918ef2b34138a65e0cb6c3a66057:/yt_dlp/extractor/youtube.py diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index d8a63a3d2..636bf42b6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -42,6 +42,7 @@ int_or_none, is_html, join_nonempty, + js_to_json, mimetype2ext, network_exceptions, NO_DEFAULT, @@ -224,28 +225,28 @@ def get_first(obj, keys, **kwargs): def build_innertube_clients(): - third_party = { + THIRD_PARTY = { 'embedUrl': 'https://google.com', # Can be any valid URL } - base_clients = ('android', 'web', 'ios', 'mweb') - priority = qualities(base_clients[::-1]) + BASE_CLIENTS = ('android', 'web', 'ios', 'mweb') + priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') - ytcfg['priority'] = 10 * priority(client.split('_', 1)[0]) - if client in base_clients: - INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) + base_client, *variant = client.split('_') + ytcfg['priority'] = 10 * priority(base_client) + + if variant == ['embedded']: + ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY + INNERTUBE_CLIENTS[f'{base_client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg) agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party agegate_ytcfg['priority'] -= 1 - elif client.endswith('_embedded'): - ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = third_party ytcfg['priority'] -= 2 - else: + elif variant: ytcfg['priority'] -= 3 @@ -257,7 +258,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' - r'shorts|movies|results|shared|hashtag|trending|feed|feeds|' + r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|' r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') @@ -760,13 +761,15 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers note='%s%s' % (note, ' (retry #%d)' % count if count else '')) except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if isinstance(e.cause, compat_HTTPError) and not is_html(e.cause.read(512)): - e.cause.seek(0) - yt_error = try_get( - self._parse_json(e.cause.read().decode(), item_id, fatal=False), - lambda x: x['error']['message'], compat_str) - if yt_error: - self._report_alerts([('ERROR', yt_error)], fatal=False) + if isinstance(e.cause, compat_HTTPError): + first_bytes = e.cause.read(512) + if not is_html(first_bytes): + yt_error = try_get( + self._parse_json( + self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + lambda x: x['error']['message'], compat_str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) # Downloading page may result in intermittent 5xx HTTP error # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome @@ -841,7 +844,7 @@ def _extract_video(self, renderer): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - # 'upload_date': strftime_or_none(timestamp, '%Y%m%d'), + 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None, 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges @@ -2242,12 +2245,7 @@ def _extract_player_url(self, *ytcfgs, webpage=None): get_all=False, expected_type=compat_str) if not player_url: return - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - return player_url + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): res = self._download_webpage( @@ -2396,11 +2394,7 @@ def _decrypt_nsig(self, s, video_id, player_url): """Turn the encrypted n field into a working signature""" if player_url is None: raise ExtractorError('Cannot decrypt nsig without player_url') - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) + player_url = urljoin('https://www.youtube.com', player_url) sig_id = ('nsig_value', s) if sig_id in self._player_cache: @@ -2418,9 +2412,14 @@ def _decrypt_nsig(self, s, video_id, player_url): raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) def _extract_n_function_name(self, jscode): - return self._search_regex( - (r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), - jscode, 'Initial JS player n function name', group='nfunc') + nfunc, idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(?P[a-zA-Z0-9$]+)(?:\[(?P\d+)\])?\([a-zA-Z0-9]\)', + jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) + if not idx: + return nfunc + return json.loads(js_to_json(self._search_regex( + rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] def _extract_n_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) @@ -2937,6 +2936,7 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres' ]) streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[]) + approx_duration = max(traverse_obj(streaming_formats, (..., 'approxDurationMs'), expected_type=float_or_none) or [0]) or None for fmt in streaming_formats: if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): @@ -2996,17 +2996,23 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): itags[itag] = 'https' stream_ids.append(stream_id) - tbr = float_or_none( - fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + language_preference = ( + 10 if audio_track.get('audioIsDefault') and 10 + else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 + else -1) + # Some formats may have much smaller duration than others (possibly damaged during encoding) + # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 + is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) < approx_duration - 10000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', - ' (default)' if audio_track.get('audioIsDefault') else ''), + ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), - throttled and 'THROTTLED', delim=', '), + throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), 'source_preference': -10 if throttled else -1, 'fps': int_or_none(fmt.get('fps')) or None, 'height': height, @@ -3014,8 +3020,10 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live): 'tbr': tbr, 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': audio_track.get('id', '').split('.')[0], - 'language_preference': 1 if audio_track.get('audioIsDefault') else -1, + 'language': join_nonempty(audio_track.get('id', '').split('.')[0], + 'desc' if language_preference < -1 else ''), + 'language_preference': language_preference, + 'preference': -10 if is_damaged else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3375,7 +3383,7 @@ def process_language(container, base_url, lang_code, sub_name, query): }) lang_subs.append({ 'ext': fmt, - 'url': update_url_query(base_url, query), + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), 'name': sub_name, }) @@ -3400,6 +3408,9 @@ def process_language(container, base_url, lang_code, sub_name, query): trans_name += format_field(lang_name, template=' from %s') process_language( automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + if lang_code == f'a-{trans_code}': + process_language( + automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {'tlang': trans_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles @@ -3593,6 +3604,26 @@ def process_language(container, base_url, lang_code, sub_name, query): class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): + @staticmethod + def passthrough_smuggled_data(func): + def _smuggle(entries, smuggled_data): + for entry in entries: + # TODO: Convert URL to music.youtube instead. + # Do we need to passthrough any other smuggled_data? + entry['url'] = smuggle_url(entry['url'], smuggled_data) + yield entry + + @functools.wraps(func) + def wrapper(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + if self.is_music_url(url): + smuggled_data['is_music_url'] = True + info_dict = func(self, url, smuggled_data) + if smuggled_data and info_dict.get('entries'): + info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data) + return info_dict + return wrapper + def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( 'channelId', webpage, 'channel id', default=None) @@ -3660,6 +3691,24 @@ def _grid_entries(self, grid_renderer): ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) break + def _music_reponsive_list_entry(self, renderer): + video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) + if video_id: + return self.url_result(f'https://music.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) + playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) + if playlist_id: + video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) + if video_id: + return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId')) + if browse_id: + return self.url_result(f'https://music.youtube.com/browse/{browse_id}', + ie=YoutubeTabIE.ie_key(), video_id=browse_id) + def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') if not isinstance(content, dict): @@ -3781,7 +3830,9 @@ def _extract_entries(self, parent_renderer, continuation_list): for content in contents: if not isinstance(content, dict): continue - is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) + is_renderer = traverse_obj( + content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', + expected_type=dict) if not is_renderer: renderer = content.get('richItemRenderer') if renderer: @@ -3798,6 +3849,7 @@ def _extract_entries(self, parent_renderer, continuation_list): 'playlistVideoListRenderer': self._playlist_entries, 'gridRenderer': self._grid_entries, 'shelfRenderer': lambda x: self._shelf_entries(x), + 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)], 'backstagePostThreadRenderer': self._post_thread_entries, 'videoRenderer': lambda x: [self._video_entry(x)], 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), @@ -4192,6 +4244,16 @@ def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=Fals if 'webpage' not in self._configuration_arg('skip'): webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) + # Reject webpage data if redirected to home page without explicitly requesting + selected_tab = self._extract_selected_tab(traverse_obj( + data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[])) or {} + if (url != 'https://www.youtube.com/feed/recommended' + and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page + and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): + msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page' + if fatal: + raise ExtractorError(msg, expected=True) + self.report_warning(msg, only_once=True) if not data: if not ytcfg and self.is_authenticated: msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.' @@ -4222,33 +4284,32 @@ def _extract_tab_endpoint(self, url, item_id, ytcfg=None, fatal=True, default_cl raise ExtractorError(err_note, expected=True) self.report_warning(err_note, item_id) - @staticmethod - def _smuggle_data(entries, data): - for entry in entries: - if data: - entry['url'] = smuggle_url(entry['url'], data) - yield entry - _SEARCH_PARAMS = None - def _search_results(self, query, params=NO_DEFAULT): + def _search_results(self, query, params=NO_DEFAULT, default_client='web'): data = {'query': query} if params is NO_DEFAULT: params = self._SEARCH_PARAMS if params: data['params'] = params + + content_keys = ( + ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'), + ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'), + # ytmusic search + ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'), + ('continuationContents', ), + ) + check_get_keys = tuple(set(keys[0] for keys in content_keys)) + continuation_list = [None] for page_num in itertools.count(1): data.update(continuation_list[0] or {}) search = self._extract_response( item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, - check_get_keys=('contents', 'onResponseReceivedCommands')) - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - yield from self._extract_entries({'contents': slr_contents}, continuation_list) + default_client=default_client, check_get_keys=check_get_keys) + slr_contents = traverse_obj(search, *content_keys) + yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list) if not continuation_list[0]: break @@ -4925,18 +4986,10 @@ def suitable(cls, url): return False if YoutubeIE.suitable(url) else super( YoutubeTabIE, cls).suitable(url) - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - if self.is_music_url(url): - smuggled_data['is_music_url'] = True - info_dict = self.__real_extract(url, smuggled_data) - if info_dict.get('entries'): - info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data) - return info_dict - _URL_RE = re.compile(rf'(?P
{_VALID_URL})(?(not_channel)|(?P/\w+))?(?P.*)$')
 
-    def __real_extract(self, url, smuggled_data):
+    @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
+    def _real_extract(self, url, smuggled_data):
         item_id = self._match_id(url)
         url = compat_urlparse.urlunparse(
             compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
@@ -5271,7 +5324,14 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
     IE_NAME = 'youtube:search'
     _SEARCH_KEY = 'ytsearch'
     _SEARCH_PARAMS = 'EgIQAQ%3D%3D'  # Videos only
-    _TESTS = []
+    _TESTS = [{
+        'url': 'ytsearch5:youtube-dl test video',
+        'playlist_count': 5,
+        'info_dict': {
+            'id': 'youtube-dl test video',
+            'title': 'youtube-dl test video',
+        }
+    }]
 
 
 class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
@@ -5279,12 +5339,20 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = 'YouTube search, newest videos first'
     _SEARCH_PARAMS = 'CAISAhAB'  # Videos only, sorted by date
+    _TESTS = [{
+        'url': 'ytsearchdate5:youtube-dl test video',
+        'playlist_count': 5,
+        'info_dict': {
+            'id': 'youtube-dl test video',
+            'title': 'youtube-dl test video',
+        }
+    }]
 
 
 class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
     IE_DESC = 'YouTube search URLs with sorting and filter support'
     IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
-    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
     _TESTS = [{
         'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
         'playlist_mincount': 5,
@@ -5311,7 +5379,60 @@ def _real_extract(self, url):
         return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query)
 
 
-class YoutubeFeedsInfoExtractor(YoutubeTabIE):
+class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor):
+    IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)'
+    IE_NAME = 'youtube:music:search_url'
+    _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)'
+    _TESTS = [{
+        'url': 'https://music.youtube.com/search?q=royalty+free+music',
+        'playlist_count': 16,
+        'info_dict': {
+            'id': 'royalty free music',
+            'title': 'royalty free music',
+        }
+    }, {
+        'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D',
+        'playlist_mincount': 30,
+        'info_dict': {
+            'id': 'royalty free music - songs',
+            'title': 'royalty free music - songs',
+        },
+        'params': {'extract_flat': 'in_playlist'}
+    }, {
+        'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists',
+        'playlist_mincount': 30,
+        'info_dict': {
+            'id': 'royalty free music - community playlists',
+            'title': 'royalty free music - community playlists',
+        },
+        'params': {'extract_flat': 'in_playlist'}
+    }]
+
+    _SECTIONS = {
+        'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==',
+        'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==',
+        'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF',
+        'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==',
+        'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==',
+        'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==',
+    }
+
+    def _real_extract(self, url):
+        qs = parse_qs(url)
+        query = (qs.get('search_query') or qs.get('q'))[0]
+        params = qs.get('sp', (None,))[0]
+        if params:
+            section = next((k for k, v in self._SECTIONS.items() if v == params), params)
+        else:
+            section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower()
+            params = self._SECTIONS.get(section)
+            if not params:
+                section = None
+        title = join_nonempty(query, section, delim=' - ')
+        return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title)
+
+
+class YoutubeFeedsInfoExtractor(InfoExtractor):
     """
     Base class for feed extractors
     Subclasses must define the _FEED_NAME property.
@@ -5325,8 +5446,7 @@ def IE_NAME(self):
 
     def _real_extract(self, url):
         return self.url_result(
-            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
-            ie=YoutubeTabIE.ie_key())
+            f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key())
 
 
 class YoutubeWatchLaterIE(InfoExtractor):