[cleanup] Revert unnecessary changes in 51d9739f8031fb37d8e25b0e9f1abea561e3d2e3

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 1d50264b6d3a3c2d273d4ccec1516cdaef6fbe49..375fc19096715361b6610ddcaca0c5c12752d066 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -67,8 +67,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
      _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
  
      _RESERVED_NAMES = (
-        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|'
-        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|'
+        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
+        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
          r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
  
      _NETRC_MACHINE = 'youtube'
@@ -85,7 +85,20 @@ def _login(self):
  
          If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
          """
+
+        def warn(message):
+            self.report_warning(message)
+
+        # username+password login is broken
+        if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
+            self.raise_login_required(
+                'Login details are needed to download this content', method='cookies')
          username, password = self._get_login_info()
+        if username:
+            warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
+        return
+        # Everything below this is broken!
+
          # No authentication to be performed
          if username is None:
              if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
@@ -126,9 +139,6 @@ def req(url, f_req, note, errnote):
                      'Google-Accounts-XSRF': 1,
                  })
  
-        def warn(message):
-            self.report_warning(message)
-
          lookup_req = [
              username,
              None, [], None, 'US', None, None, 2, False, True,
@@ -291,12 +301,22 @@ def _real_initialize(self):
      _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
  
      def _generate_sapisidhash_header(self):
-        sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
+        # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+        # See: https://github.com/yt-dlp/yt-dlp/issues/393
+        yt_cookies = self._get_cookies('https://www.youtube.com')
+        sapisid_cookie = dict_get(
+            yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
          if sapisid_cookie is None:
              return
          time_now = round(time.time())
-        sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
-        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
+        # The SAPISID cookie is required, so set it from the value found above if it is not already present
+        if not yt_cookies.get('SAPISID'):
+            self._set_cookie(
+                '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
+        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
+        sapisidhash = hashlib.sha1(
+            f'{time_now} {sapisid_cookie.value} https://www.youtube.com'.encode('utf-8')).hexdigest()
+        return f'SAPISIDHASH {time_now}_{sapisidhash}'
  
      def _call_api(self, ep, query, video_id, fatal=True, headers=None,
                    note='Downloading API JSON', errnote='Unable to download API page',
@@ -1856,6 +1876,16 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
              'comment_count': len(comments),
          }
  
+    @staticmethod
+    def _get_video_info_params(video_id):
+        return {
+            'video_id': video_id,
+            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+            'html5': '1',
+            'c': 'TVHTML5',
+            'cver': '6.20180913',
+        }
+
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
          video_id = self._match_id(url)
@@ -1888,12 +1918,11 @@ def get_text(x):
                      base_url + 'get_video_info', video_id,
                      'Fetching youtube music info webpage',
                      'unable to download youtube music info webpage', query={
-                        'video_id': video_id,
-                        'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                        **self._get_video_info_params(video_id),
                          'el': 'detailpage',
                          'c': 'WEB_REMIX',
                          'cver': '0.1',
-                        'cplayer': 'UNIPLAYER'
+                        'cplayer': 'UNIPLAYER',
                      }, fatal=False)),
                  lambda x: x['player_response'][0],
                  compat_str) or '{}', video_id)
@@ -1915,11 +1944,8 @@ def get_text(x):
              pr = self._parse_json(try_get(compat_parse_qs(
                  self._download_webpage(
                      base_url + 'get_video_info', video_id,
-                    'Refetching age-gated info webpage',
-                    'unable to download video info webpage', query={
-                        'video_id': video_id,
-                        'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                    }, fatal=False)),
+                    'Refetching age-gated info webpage', 'unable to download video info webpage',
+                    query=self._get_video_info_params(video_id), fatal=False)),
                  lambda x: x['player_response'][0],
                  compat_str) or '{}', video_id)
              if pr:
@@ -1994,7 +2020,10 @@ def feed_entry(name):
          formats, itags, stream_ids = [], [], []
          itag_qualities = {}
          player_url = None
-        q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
+        q = qualities([
+            'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high',  # Audio only formats
+            'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
+        ])
  
          streaming_data = player_response.get('streamingData') or {}
          streaming_formats = streaming_data.get('formats') or []
@@ -2013,6 +2042,8 @@ def feed_entry(name):
                  continue
  
              quality = fmt.get('quality')
+            if quality == 'tiny' or not quality:
+                quality = fmt.get('audioQuality', '').lower() or quality
              if itag and quality:
                  itag_qualities[itag] = quality
              # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
@@ -2090,7 +2121,7 @@ def feed_entry(name):
                          r'/itag/(\d+)', f['url'], 'itag', default=None)
                      if itag:
                          f['format_id'] = itag
-                formats.append(f)
+                    formats.append(f)
  
          if self.get_param('youtube_include_dash_manifest', True):
              for sd in (streaming_data, ytm_streaming_data):
@@ -2102,9 +2133,6 @@ def feed_entry(name):
                          if itag in itags:
                              continue
                          if itag in itag_qualities:
-                            # Not actually usefull since the sorting is already done with "quality,res,fps,codec"
-                            # but kept to maintain feature parity (and code similarity) with youtube-dl
-                            # Remove if this causes any issues with sorting in future
                              f['quality'] = q(itag_qualities[itag])
                          filesize = int_or_none(self._search_regex(
                              r'/clen/(\d+)', f.get('fragment_base_url')
@@ -2169,16 +2197,24 @@ def feed_entry(name):
                  if 'maxresdefault' in thumbnail_url:
                      thumbnail_url = thumbnail_url.split('?')[0]
                  thumbnails.append({
-                    'height': int_or_none(thumbnail.get('height')),
                      'url': thumbnail_url,
+                    'height': int_or_none(thumbnail.get('height')),
                      'width': int_or_none(thumbnail.get('width')),
+                    'preference': 1 if 'maxresdefault' in thumbnail_url else -1
                  })
-            if thumbnails:
-                break
-        else:
-            thumbnail = search_meta(['og:image', 'twitter:image'])
-            if thumbnail:
-                thumbnails = [{'url': thumbnail}]
+        thumbnail_url = search_meta(['og:image', 'twitter:image'])
+        if thumbnail_url:
+            thumbnails.append({
+                'url': thumbnail_url,
+                'preference': 1 if 'maxresdefault' in thumbnail_url else -1
+            })
+        # All videos have a maxresdefault thumbnail, but sometimes it does not appear in the webpage
+        # See: https://github.com/ytdl-org/youtube-dl/issues/29049
+        thumbnails.append({
+            'url': 'https://i.ytimg.com/vi/%s/maxresdefault.jpg' % video_id,
+            'preference': 1,
+        })
+        self._remove_duplicate_formats(thumbnails)
  
          category = microformat.get('category') or search_meta('genre')
          channel_id = video_details.get('channelId') \
@@ -2832,6 +2868,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
      }, {
          'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
          'only_matching': True,
+    }, {
+        'note': 'A channel that is not live. Should raise error',
+        'url': 'https://www.youtube.com/user/numberphile/live',
+        'only_matching': True,
      }, {
          'url': 'https://www.youtube.com/feed/trending',
          'only_matching': True,
@@ -3566,7 +3606,13 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
  
              else:
                  # Youtube may send alerts if there was an issue with the continuation page
-                self._extract_and_report_alerts(response, expected=False)
+                try:
+                    self._extract_and_report_alerts(response, expected=False)
+                except ExtractorError as e:
+                    if fatal:
+                        raise
+                    self.report_warning(error_to_compat_str(e))
+                    return
                  if not check_get_keys or dict_get(response, check_get_keys):
                      break
                  # Youtube sometimes sends incomplete data
@@ -3628,7 +3674,7 @@ def __real_extract(self, url, smuggled_data):
  
          def get_mobj(url):
              mobj = self._url_re.match(url).groupdict()
-            mobj.update((k, '') for k,v in mobj.items() if v is None)
+            mobj.update((k, '') for k, v in mobj.items() if v is None)
              return mobj
  
          mobj = get_mobj(url)
@@ -3688,23 +3734,26 @@ def get_mobj(url):
          if tabs:
              selected_tab = self._extract_selected_tab(tabs)
              tab_name = selected_tab.get('title', '')
-            if (mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]
-                    and 'no-youtube-channel-redirect' not in compat_opts):
-                if not mobj['not_channel'] and item_id[:2] == 'UC':
-                    # Topic channels don't have /videos. Use the equivalent playlist instead
-                    self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
-                    pl_id = 'UU%s' % item_id[2:]
-                    pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
-                    try:
-                        pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
-                        for alert_type, alert_message in self._extract_alerts(pl_data):
-                            if alert_type == 'error':
-                                raise ExtractorError('Youtube said: %s' % alert_message)
-                        item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
-                    except ExtractorError:
-                        self.report_warning('The playlist gave error. Falling back to channel URL')
-                else:
-                    self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
+            if 'no-youtube-channel-redirect' not in compat_opts:
+                if mobj['tab'] == '/live':
+                    # Live tab should have redirected to the video
+                    raise ExtractorError('The channel is not currently live', expected=True)
+                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
+                    if not mobj['not_channel'] and item_id[:2] == 'UC':
+                        # Topic channels don't have /videos. Use the equivalent playlist instead
+                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
+                        pl_id = 'UU%s' % item_id[2:]
+                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
+                        try:
+                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
+                            for alert_type, alert_message in self._extract_alerts(pl_data):
+                                if alert_type == 'error':
+                                    raise ExtractorError('Youtube said: %s' % alert_message)
+                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
+                        except ExtractorError:
+                            self.report_warning('The playlist gave error. Falling back to channel URL')
+                    else:
+                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
  
          self.write_debug('Final URL: %s' % url)
  
@@ -3727,7 +3776,8 @@ def get_mobj(url):
              data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
              compat_str) or video_id
          if video_id:
-            self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
+            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
+                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
              return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
  
          raise ExtractorError('Unable to recognize tab page')
@@ -3808,7 +3858,7 @@ def suitable(cls, url):
  
      def _real_extract(self, url):
          playlist_id = self._match_id(url)
-        is_music_url = self.is_music_url(url)
+        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
          url = update_url_query(
              'https://www.youtube.com/playlist',
              parse_qs(url) or {'list': playlist_id})
@@ -4008,9 +4058,6 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
      def IE_NAME(self):
          return 'youtube:%s' % self._FEED_NAME
  
-    def _real_initialize(self):
-        self._login()
-
      def _real_extract(self, url):
          return self.url_result(
              'https://www.youtube.com/feed/%s' % self._FEED_NAME,
@@ -4035,6 +4082,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
      _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
      _FEED_NAME = 'recommended'
+    _LOGIN_REQUIRED = False
      _TESTS = [{
          'url': ':ytrec',
          'only_matching': True,