[utils] Parse `vp09` as vp9
yt_dlp/extractor/youtube.py
index 54f5ef15cc16a129783727cd6a43aaf4abf2bb68..64475edec0876c05a8b302c14e1bb9215fa0903b 100644
@@ -695,7 +695,7 @@ def _extract_video(self, renderer):
 
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
-    IE_DESC = 'YouTube.com'
+    IE_DESC = 'YouTube'
     _INVIDIOUS_SITES = (
         # invidious-redirect websites
         r'(?:www\.)?redirect\.invidious\.io',
@@ -1720,7 +1720,7 @@ def _extract_player_info(cls, player_url):
             raise ExtractorError('Cannot identify player %r' % player_url)
         return id_m.group('id')
 
-    def _load_player(self, video_id, player_url, fatal=True) -> bool:
+    def _load_player(self, video_id, player_url, fatal=True):
         player_id = self._extract_player_info(player_url)
         if player_id not in self._code_cache:
             code = self._download_webpage(
@@ -1729,7 +1729,7 @@ def _load_player(self, video_id, player_url, fatal=True) -> bool:
                 errnote='Download of %s failed' % player_url)
             if code:
                 self._code_cache[player_id] = code
-        return player_id in self._code_cache
+        return self._code_cache.get(player_id)
 
     def _extract_signature_function(self, video_id, player_url, example_sig):
         player_id = self._extract_player_info(player_url)
@@ -1743,8 +1743,8 @@ def _extract_signature_function(self, video_id, player_url, example_sig):
         if cache_spec is not None:
             return lambda s: ''.join(s[i] for i in cache_spec)
 
-        if self._load_player(video_id, player_url):
-            code = self._code_cache[player_id]
+        code = self._load_player(video_id, player_url)
+        if code:
             res = self._parse_sig_js(code)
 
             test_string = ''.join(map(compat_chr, range(len(example_sig))))
@@ -1755,6 +1755,9 @@ def _extract_signature_function(self, video_id, player_url, example_sig):
             return res
 
     def _print_sig_code(self, func, example_sig):
+        if not self.get_param('youtube_print_sig_code'):
+            return
+
         def gen_sig_code(idxs):
             def _genslice(start, end, step):
                 starts = '' if start == 0 else str(start)
@@ -1831,13 +1834,58 @@ def _decrypt_signature(self, s, video_id, player_url):
                 )
                 self._player_cache[player_id] = func
             func = self._player_cache[player_id]
-            if self.get_param('youtube_print_sig_code'):
-                self._print_sig_code(func, s)
+            self._print_sig_code(func, s)
             return func(s)
         except Exception as e:
-            tb = traceback.format_exc()
-            raise ExtractorError(
-                'Signature extraction failed: ' + tb, cause=e)
+            raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e)
+
+    def _decrypt_nsig(self, s, video_id, player_url):
+        """Turn the encrypted n field into a working signature"""
+        if player_url is None:
+            raise ExtractorError('Cannot decrypt nsig without player_url')
+        if player_url.startswith('//'):
+            player_url = 'https:' + player_url
+        elif not re.match(r'https?://', player_url):
+            player_url = compat_urlparse.urljoin(
+                'https://www.youtube.com', player_url)
+
+        sig_id = ('nsig_value', s)
+        if sig_id in self._player_cache:
+            return self._player_cache[sig_id]
+
+        try:
+            player_id = ('nsig', player_url)
+            if player_id not in self._player_cache:
+                self._player_cache[player_id] = self._extract_n_function(video_id, player_url)
+            func = self._player_cache[player_id]
+            self._player_cache[sig_id] = func(s)
+            self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}')
+            return self._player_cache[sig_id]
+        except Exception as e:
+            raise ExtractorError(traceback.format_exc(), cause=e)
+
+    def _extract_n_function_name(self, jscode):
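+        # Matches a snippet like `.get("n"))&&(b=XXX(Y)` in the player JS,
+        # where XXX is the three-character name of the n-transform function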
+        return self._search_regex(
+            (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',),
+            jscode, 'Initial JS player n function name', group='nfunc')
+
+    def _extract_n_function(self, video_id, player_url):
+        player_id = self._extract_player_info(player_url)
+        func_code = self._downloader.cache.load('youtube-nsig', player_id)
+
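+        # func_code is the (argnames, code) pair produced by
+        # JSInterpreter.extract_function_code below; reusing the cached copy
+        # means the player JS does not have to be fetched and re-parsed on every run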
+        if func_code:
+            jsi = JSInterpreter(func_code)
+        else:
+            jscode = self._load_player(video_id, player_url)
+            funcname = self._extract_n_function_name(jscode)
+            jsi = JSInterpreter(jscode)
+            func_code = jsi.extract_function_code(funcname)
+            self._downloader.cache.store('youtube-nsig', player_id, func_code)
+
+        if self.get_param('youtube_print_sig_code'):
+            self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n')
+
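+        # Functions returned by JSInterpreter take their arguments as a list,
+        # hence the `[s]` in the call below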
+        return lambda s: jsi.extract_function_from_code(*func_code)([s])
 
     def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
         """
@@ -1856,9 +1904,8 @@ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
                     raise ExtractorError(error_msg)
                 self.report_warning(error_msg)
                 return
-            if self._load_player(video_id, player_url, fatal=fatal):
-                player_id = self._extract_player_info(player_url)
-                code = self._code_cache[player_id]
+            code = self._load_player(video_id, player_url, fatal=fatal)
+            if code:
                 sts = int_or_none(self._search_regex(
                     r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code,
                     'JS player signature timestamp', group='sts', fatal=fatal))
@@ -2387,7 +2434,7 @@ def append_client(client_name):
         return prs, player_url
 
     def _extract_formats(self, streaming_data, video_id, player_url, is_live):
-        itags, stream_ids = [], []
+        itags, stream_ids = {}, []
         itag_qualities, res_qualities = {}, {}
         q = qualities([
             # Normally tiny is the smallest video-only formats. But
@@ -2440,8 +2487,18 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
                 sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
                 fmt_url += '&' + sp + '=' + signature
 
+            query = parse_qs(fmt_url)
+            throttled = False
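+            # Only transform 'n' when it is present and the URL does not
+            # already set ratebypass=yes (such URLs appear not to be throttled)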
+            if query.get('ratebypass') != ['yes'] and query.get('n'):
+                try:
+                    fmt_url = update_url_query(fmt_url, {
+                        'n': self._decrypt_nsig(query['n'][0], video_id, player_url)})
+                except ExtractorError as e:
+                    self.report_warning(f'nsig extraction failed: You may experience throttling for some formats\n{e}', only_once=True)
+                    throttled = True
+
             if itag:
-                itags.append(itag)
+                itags[itag] = 'https'
                 stream_ids.append(stream_id)
 
             tbr = float_or_none(
@@ -2453,7 +2510,9 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
                 'format_note': ', '.join(filter(None, (
                     '%s%s' % (audio_track.get('displayName') or '',
                               ' (default)' if audio_track.get('audioIsDefault') else ''),
-                    fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))),
+                    fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
+                    throttled and 'THROTTLED'))),
+                'source_preference': -10 if throttled else -1,
                 'fps': int_or_none(fmt.get('fps')),
                 'height': height,
                 'quality': q(quality),
@@ -2489,46 +2548,36 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live):
             and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
         get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
 
-        def guess_quality(f):
-            for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)):
-                if val in qdict:
-                    return q(qdict[val])
-            return -1
+        def process_manifest_format(f, proto, itag):
+            if itag in itags:
+                if itags[itag] == proto or f'{itag}-{proto}' in itags:
+                    return False
+                itag = f'{itag}-{proto}'
+            if itag:
+                f['format_id'] = itag
+                itags[itag] = proto
+
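+            # Prefer the itag-based quality, fall back to the resolution-based
+            # one, and use -1 when neither is known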
+            f['quality'] = next((
+                q(qdict[val])
+                for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities))
+                if val in qdict), -1)
+            return True
 
         for sd in streaming_data:
             hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
             if hls_manifest_url:
                 for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
-                    itag = self._search_regex(
-                        r'/itag/(\d+)', f['url'], 'itag', default=None)
-                    if itag in itags:
-                        itag += '-hls'
-                        if itag in itags:
-                            continue
-                    if itag:
-                        f['format_id'] = itag
-                        itags.append(itag)
-                    f['quality'] = guess_quality(f)
-                    yield f
+                    if process_manifest_format(f, 'hls', self._search_regex(
+                            r'/itag/(\d+)', f['url'], 'itag', default=None)):
+                        yield f
 
             dash_manifest_url = get_dash and sd.get('dashManifestUrl')
             if dash_manifest_url:
                 for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
-                    itag = f['format_id']
-                    if itag in itags:
-                        itag += '-dash'
-                        if itag in itags:
-                            continue
-                    if itag:
-                        f['format_id'] = itag
-                        itags.append(itag)
-                    f['quality'] = guess_quality(f)
-                    filesize = int_or_none(self._search_regex(
-                        r'/clen/(\d+)', f.get('fragment_base_url')
-                        or f['url'], 'file size', default=None))
-                    if filesize:
-                        f['filesize'] = filesize
-                    yield f
+                    if process_manifest_format(f, 'dash', f['format_id']):
+                        f['filesize'] = int_or_none(self._search_regex(
+                            r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
+                        yield f
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -2645,12 +2694,6 @@ def feed_entry(name):
             if reason:
                 self.raise_no_formats(reason, expected=True)
 
-        for f in formats:
-            if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']:  # throttled
-                f['source_preference'] = -10
-                # TODO: this method is not reliable
-                f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)'
-
         # Source is given priority since formats that throttle are given lower source_preference
         # When throttling issue is fully fixed, remove this
         self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang'))
@@ -2696,6 +2739,8 @@ def feed_entry(name):
             thumbnails.append({
                 'url': thumbnail_url,
             })
+        original_thumbnails = thumbnails.copy()
+
         # The best resolution thumbnails sometimes do not appear in the webpage
         # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
         # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
@@ -2706,7 +2751,6 @@ def feed_entry(name):
             'default', '1', '2', '3'
         ]
         n_thumbnail_names = len(thumbnail_names)
-
         thumbnails.extend({
             'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
                 video_id=video_id, name=name, ext=ext,
@@ -2716,6 +2760,7 @@ def feed_entry(name):
             i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
             thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
         self._remove_duplicate_formats(thumbnails)
+        self._downloader._sort_thumbnails(original_thumbnails)
 
         category = get_first(microformats, 'category') or search_meta('genre')
         channel_id = str_or_none(
@@ -2745,6 +2790,9 @@ def feed_entry(name):
             'title': self._live_title(video_title) if is_live else video_title,
             'formats': formats,
             'thumbnails': thumbnails,
+            # The best thumbnail that we are sure exists. Prevents unnecessary
+            # URL checking if the user doesn't care about getting the best possible thumbnail
+            'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
             'description': video_description,
             'upload_date': unified_strdate(
                 get_first(microformats, 'uploadDate')
@@ -3010,7 +3058,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
 
 
 class YoutubeTabIE(YoutubeBaseInfoExtractor):
-    IE_DESC = 'YouTube.com tab'
+    IE_DESC = 'YouTube Tabs'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:\w+\.)?
@@ -4238,7 +4286,7 @@ def get_mobj(url):
 
 
 class YoutubePlaylistIE(InfoExtractor):
-    IE_DESC = 'YouTube.com playlists'
+    IE_DESC = 'YouTube playlists'
     _VALID_URL = r'''(?x)(?:
                         (?:https?://)?
                         (?:\w+\.)?
@@ -4362,7 +4410,7 @@ def _real_extract(self, url):
 
 
 class YoutubeYtUserIE(InfoExtractor):
-    IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
+    IE_DESC = 'YouTube user videos; "ytuser:" prefix'
     _VALID_URL = r'ytuser:(?P<id>.+)'
     _TESTS = [{
         'url': 'ytuser:phihag',
@@ -4378,7 +4426,7 @@ def _real_extract(self, url):
 
 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
     IE_NAME = 'youtube:favorites'
-    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
+    IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)'
     _VALID_URL = r':ytfav(?:ou?rite)?s?'
     _LOGIN_REQUIRED = True
     _TESTS = [{
@@ -4396,10 +4444,7 @@ def _real_extract(self, url):
 
 
 class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
-    IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
-    # there doesn't appear to be a real limit, for example if you search for
-    # 'python' you get more than 8.000.000 results
-    _MAX_RESULTS = float('inf')
+    IE_DESC = 'YouTube searches'
     IE_NAME = 'youtube:search'
     _SEARCH_KEY = 'ytsearch'
     _SEARCH_PARAMS = None
@@ -4459,13 +4504,14 @@ def _search_results(self, query):
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
     _SEARCH_KEY = 'ytsearchdate'
-    IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
+    IE_DESC = 'YouTube searches, newest videos first'
     _SEARCH_PARAMS = 'CAI%3D'
 
 
 class YoutubeSearchURLIE(YoutubeSearchIE):
-    IE_DESC = 'YouTube.com search URLs'
+    IE_DESC = 'YouTube search URLs with sorting and filter support'
     IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
+    _SEARCH_KEY = None
     _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
     # _MAX_RESULTS = 100
     _TESTS = [{
@@ -4511,7 +4557,7 @@ def _real_extract(self, url):
 
 class YoutubeWatchLaterIE(InfoExtractor):
     IE_NAME = 'youtube:watchlater'
-    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+    IE_DESC = 'YouTube watch later list; ":ytwatchlater" keyword (requires cookies)'
     _VALID_URL = r':ytwatchlater'
     _TESTS = [{
         'url': ':ytwatchlater',
@@ -4524,7 +4570,7 @@ def _real_extract(self, url):
 
 
 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    IE_DESC = 'YouTube recommended videos; ":ytrec" keyword'
     _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
     _LOGIN_REQUIRED = False
@@ -4541,7 +4587,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
 
 
 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
+    IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)'
     _VALID_URL = r':ytsub(?:scription)?s?'
     _FEED_NAME = 'subscriptions'
     _TESTS = [{
@@ -4554,7 +4600,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
 
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+    IE_DESC = 'YouTube watch history; ":ythis" keyword (requires cookies)'
     _VALID_URL = r':ythis(?:tory)?'
     _FEED_NAME = 'history'
     _TESTS = [{