[ie/youtube] Suppress "Unavailable videos are hidden" warning (#10159)

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index a227f2425815928dcb8aeab6776d441dd4a7ddc4..094b1e9a3624404ee7c9f492e5db0f5b6a827907 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -468,7 +468,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
          'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko',
      ]
  
-    _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'}
+    _IGNORED_WARNINGS = {
+        'Unavailable videos will be hidden during playback',
+        'Unavailable videos are hidden',
+    }
  
      _YT_HANDLE_RE = r'@[\w.-]{3,30}'  # https://support.google.com/youtube/answer/11585688?hl=en
      _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}'
@@ -885,14 +888,14 @@ def _get_count(self, data, *path_list):
          return count
  
      @staticmethod
-    def _extract_thumbnails(data, *path_list):
+    def _extract_thumbnails(data, *path_list, final_key='thumbnails'):
          """
          Extract thumbnails from thumbnails dict
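-        @param path_list: path list to level that contains 'thumbnails' key
+        @param path_list: path list to level that contains the `final_key` key ('thumbnails' by default)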
          """
          thumbnails = []
          for path in path_list or [()]:
-            for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)):
+            for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)):
                  thumbnail_url = url_or_none(thumbnail.get('url'))
                  if not thumbnail_url:
                      continue
@@ -3797,6 +3800,8 @@ def _needs_live_processing(self, live_status, duration):
  
      def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
          CHUNK_SIZE = 10 << 20
+        PREFERRED_LANG_VALUE = 10
+        original_language = None
          itags, stream_ids = collections.defaultdict(set), []
          itag_qualities, res_qualities = {}, {0: None}
          q = qualities([
@@ -3845,6 +3850,13 @@ def build_fragments(f):
                      itag_qualities[itag] = quality
                  if height:
                      res_qualities[height] = quality
+
+            is_default = audio_track.get('audioIsDefault')
+            is_descriptive = 'descriptive' in (audio_track.get('displayName') or '').lower()
+            language_code = audio_track.get('id', '').split('.')[0]
+            if language_code and is_default:
+                original_language = language_code
+
              # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
              # (adding `&sq=0` to the URL) and parsing emsg box to determine the
-            # number of fragment that would subsequently requested with (`&sq=N`)
+            # number of fragments that would subsequently be requested with (`&sq=N`)
@@ -3870,7 +3882,6 @@ def build_fragments(f):
                      continue
  
              query = parse_qs(fmt_url)
-            throttled = False
              if query.get('n'):
                  try:
                      decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0])
@@ -3884,20 +3895,16 @@ def build_fragments(f):
                                            f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n')
                      if player_url:
                          self.report_warning(
-                            f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}'
+                            f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}'
                              f'         n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True)
                          self.write_debug(e, only_once=True)
                      else:
                          self.report_warning(
-                            'Cannot decrypt nsig without player_url: You may experience throttling for some formats',
+                            'Cannot decrypt nsig without player_url: Some formats may be missing',
                              video_id=video_id, only_once=True)
-                    throttled = True
+                    continue
  
              tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
-            language_preference = (
-                10 if audio_track.get('audioIsDefault') and 10
-                else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
-                else -1)
              format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)}))
              # Some formats may have much smaller duration than others (possibly damaged during encoding)
              # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
@@ -3924,17 +3931,15 @@ def build_fragments(f):
                  'filesize': int_or_none(fmt.get('contentLength')),
                  'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}',
                  'format_note': join_nonempty(
-                    join_nonempty(audio_track.get('displayName'),
-                                  language_preference > 0 and ' (default)', delim=''),
+                    join_nonempty(audio_track.get('displayName'), is_default and ' (default)', delim=''),
                      name, fmt.get('isDrc') and 'DRC',
                      try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                      try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
-                    throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN',
+                    is_damaged and 'DAMAGED', is_broken and 'BROKEN',
                      (self.get_param('verbose') or all_formats) and client_name,
                      delim=', '),
                  # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
-                'source_preference': ((-10 if throttled else -5 if itag == '22' else -1)
-                                      + (100 if 'Premium' in name else 0)),
+                'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0),
                  'fps': fps if fps > 1 else None,  # For some formats, fps is wrongly returned as 1
                  'audio_channels': fmt.get('audioChannels'),
                  'height': height,
@@ -3944,9 +3949,8 @@ def build_fragments(f):
                  'filesize_approx': filesize_from_tbr(tbr, format_duration),
                  'url': fmt_url,
                  'width': int_or_none(fmt.get('width')),
-                'language': join_nonempty(audio_track.get('id', '').split('.')[0],
-                                          'desc' if language_preference < -1 else '') or None,
-                'language_preference': language_preference,
+                'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None,
+                'language_preference': PREFERRED_LANG_VALUE if is_default else -10 if is_descriptive else -1,
                  # Strictly de-prioritize broken, damaged and 3gp formats
                  'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None,
              }
@@ -4007,6 +4011,10 @@ def process_manifest_format(f, proto, client_name, itag):
              elif itag:
                  f['format_id'] = itag
  
+            if original_language and f.get('language') == original_language:
+                f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ')
+                f['language_preference'] = PREFERRED_LANG_VALUE
+
              if f.get('source_preference') is None:
                  f['source_preference'] = -1
  
@@ -4351,7 +4359,7 @@ def is_bad_format(fmt):
              'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'),
              'live_status': live_status,
              'release_timestamp': live_start_time,
-            '_format_sort_fields': (  # source_preference is lower for throttled/potentially damaged formats
+            '_format_sort_fields': (  # source_preference is lower for potentially damaged formats
                  'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto'),
          }
  
@@ -5124,6 +5132,10 @@ def _extract_metadata_from_tabs(self, item_id, data):
          else:
              metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)
  
+        # pageHeaderViewModel slow rollout began April 2024
+        page_header_view_model = traverse_obj(data, (
+            'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict}))
+
          # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
          # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
          def _get_uncropped(url):
@@ -5139,8 +5151,10 @@ def _get_uncropped(url):
                      'preference': 1,
                  })
  
-        channel_banners = self._extract_thumbnails(
-            data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
+        channel_banners = (
+            self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner')))
+            or self._extract_thumbnails(
+                page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources'))
          for banner in channel_banners:
              banner['preference'] = -10
  
@@ -5167,7 +5181,11 @@ def _get_uncropped(url):
                        or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag'))
                        or info['id']),
              'availability': self._extract_availability(data),
-            'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
+            'channel_follower_count': (
+                self._get_count(data, ('header', ..., 'subscriberCountText'))
+                or traverse_obj(page_header_view_model, (
+                    'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts',
+                    lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))),
              'description': try_get(metadata_renderer, lambda x: x.get('description', '')),
              'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str}))
                       or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))),