[cleanup] Misc cleanup

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 078f49696ed9d4447d3b3e6dd7e6ee4da76d12bd..5546aa9a34f196f45071511c10fdc353a05625b5 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -1,3 +1,4 @@
+import base64
  import calendar
  import copy
  import datetime
@@ -30,6 +31,7 @@
      NO_DEFAULT,
      ExtractorError,
      bug_reports_message,
+    classproperty,
      clean_html,
      datetime_from_str,
      dict_get,
@@ -347,6 +349,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
          r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
          r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
          r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
+        # Piped instances from https://github.com/TeamPiped/Piped/wiki/Instances
+        r'(?:www\.)?piped\.kavin\.rocks',
+        r'(?:www\.)?piped\.silkky\.cloud',
+        r'(?:www\.)?piped\.tokhmi\.xyz',
+        r'(?:www\.)?piped\.moomoo\.me',
+        r'(?:www\.)?il\.ax',
+        r'(?:www\.)?piped\.syncpundit\.com',
+        r'(?:www\.)?piped\.mha\.fi',
+        r'(?:www\.)?piped\.mint\.lgbt',
+        r'(?:www\.)?piped\.privacy\.com\.de',
      )
  
      def _initialize_consent(self):
@@ -382,9 +394,7 @@ def _real_initialize(self):
          self._check_login_required()
  
      def _check_login_required(self):
-        if (self._LOGIN_REQUIRED
-                and self.get_param('cookiefile') is None
-                and self.get_param('cookiesfrombrowser') is None):
+        if self._LOGIN_REQUIRED and not self._cookies_passed:
              self.raise_login_required('Login details are needed to download this content', method='cookies')
  
      _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
@@ -2199,7 +2209,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'description': 'md5:2ef1d002cad520f65825346e2084e49d',
              },
              'params': {'skip_download': True}
-        },
+        }, {
+            # Story. Requires specific player params to work.
+            # Note: stories get removed after some period of time
+            'url': 'https://www.youtube.com/watch?v=yN3x1t3sieA',
+            'info_dict': {
+                'id': 'yN3x1t3sieA',
+                'ext': 'mp4',
+                'uploader': 'Linus Tech Tips',
+                'duration': 13,
+                'channel': 'Linus Tech Tips',
+                'playable_in_embed': True,
+                'tags': [],
+                'age_limit': 0,
+                'uploader_url': 'http://www.youtube.com/user/LinusTechTips',
+                'upload_date': '20220402',
+                'thumbnail': 'https://i.ytimg.com/vi_webp/yN3x1t3sieA/maxresdefault.webp',
+                'title': 'Story',
+                'live_status': 'not_live',
+                'uploader_id': 'LinusTechTips',
+                'view_count': int,
+                'description': '',
+                'channel_id': 'UCXuqSBlHAE6Xw-yeJA0Tunw',
+                'categories': ['Science & Technology'],
+                'channel_url': 'https://www.youtube.com/channel/UCXuqSBlHAE6Xw-yeJA0Tunw',
+                'availability': 'unlisted',
+            }
+        }
      ]
  
      @classmethod
@@ -2831,12 +2867,17 @@ def extract_thread(contents):
              lambda p: int_or_none(p, default=sys.maxsize), self._configuration_arg('max_comments', ) + [''] * 4)
  
          continuation = self._extract_continuation(root_continuation_data)
-        message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1)
-        if message and not parent:
-            self.report_warning(message, video_id=video_id)
  
          response = None
+        is_forced_continuation = False
          is_first_continuation = parent is None
+        if is_first_continuation and not continuation:
+            # Sometimes you can get comments by generating the continuation yourself,
+            # even if YouTube initially reports them being disabled - e.g. stories comments.
+            # Note: if the comment section is actually disabled, YouTube may return a response with
+            # required check_get_keys missing. So we will disable that check initially in this case.
+            continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
+            is_forced_continuation = True
  
          for page_num in itertools.count(0):
              if not continuation:
@@ -2857,8 +2898,8 @@ def extract_thread(contents):
              response = self._extract_response(
                  item_id=None, query=continuation,
                  ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
-                check_get_keys='onResponseReceivedEndpoints')
-
+                check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None)
+            is_forced_continuation = False
              continuation_contents = traverse_obj(
                  response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
  
@@ -2883,6 +2924,18 @@ def extract_thread(contents):
                  if continuation:
                      break
  
+        message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1)
+        if message and not parent and tracker['running_total'] == 0:
+            self.report_warning(f'Youtube said: {message}', video_id=video_id, only_once=True)
+
+    @staticmethod
+    def _generate_comment_continuation(video_id):
+        """
+        Generates the initial comment section continuation token from the given video id
+        """
+        token = f'\x12\r\x12\x0b{video_id}\x18\x062\'"\x11"\x0b{video_id}0\x00x\x020\x00B\x10comments-section'
+        return base64.b64encode(token.encode()).decode()
+
      def _get_comments(self, ytcfg, video_id, contents, webpage):
          """Entry for comment extraction"""
          def _real_comment_extract(contents):
@@ -2936,7 +2989,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
          headers = self.generate_api_headers(
              ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client)
  
-        yt_query = {'videoId': video_id}
+        yt_query = {
+            'videoId': video_id,
+            'params': '8AEB'  # enable stories
+        }
          yt_query.update(self._generate_player_context(sts))
          return self._extract_response(
              item_id=video_id, ep='player', query=yt_query,
@@ -3126,7 +3182,8 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati
              # Eg: __2ABJjxzNo, ySuUZEjARPY
              is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
              if is_damaged:
-                self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
+                self.report_warning(
+                    f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
              dct = {
                  'asr': int_or_none(fmt.get('audioSampleRate')),
                  'filesize': int_or_none(fmt.get('contentLength')),
@@ -3136,7 +3193,8 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati
                                ' (default)' if language_preference > 0 else ''),
                      fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
                      throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
-                'source_preference': -10 if throttled else -1,
+                # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372
+                'source_preference': -10 if throttled else -5 if itag == '22' else -1,
                  'fps': int_or_none(fmt.get('fps')) or None,
                  'height': height,
                  'quality': q(quality),
@@ -3174,6 +3232,8 @@ def _extract_formats(self, streaming_data, video_id, player_url, is_live, durati
          skip_manifests = self._configuration_arg('skip')
          if not self.get_param('youtube_include_hls_manifest', True):
              skip_manifests.append('hls')
+        if not self.get_param('youtube_include_dash_manifest', True):
+            skip_manifests.append('dash')
          get_dash = 'dash' not in skip_manifests and (
              not is_live or live_from_start or self._configuration_arg('include_live_dash'))
          get_hls = not live_from_start and 'hls' not in skip_manifests
@@ -3251,7 +3311,7 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url):
          webpage = None
          if 'webpage' not in self._configuration_arg('player_skip'):
              webpage = self._download_webpage(
-                webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
+                webpage_url + '&bpctr=9999999999&has_verified=1&pp=8AEB', video_id, fatal=False)
  
          master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg()
  
@@ -3402,13 +3462,13 @@ def feed_entry(name):
          original_thumbnails = thumbnails.copy()
  
          # The best resolution thumbnails sometimes does not appear in the webpage
-        # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
+        # See: https://github.com/yt-dlp/yt-dlp/issues/340
          # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
          thumbnail_names = [
-            'maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3',
-            'hqdefault', 'hq1', 'hq2', 'hq3', '0',
-            'mqdefault', 'mq1', 'mq2', 'mq3',
-            'default', '1', '2', '3'
+            # While the *1,*2,*3 thumbnails are just below their corresponding "*default" variants
+            # in resolution, they are not the custom thumbnail, so de-prioritize them
+            'maxresdefault', 'hq720', 'sddefault', 'hqdefault', '0', 'mqdefault', 'default',
+            'sd1', 'sd2', 'sd3', 'hq1', 'hq2', 'hq3', 'mq1', 'mq2', 'mq3', '1', '2', '3'
          ]
          n_thumbnail_names = len(thumbnail_names)
          thumbnails.extend({
@@ -3696,7 +3756,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
              unified_strdate(get_first(microformats, 'uploadDate'))
              or unified_strdate(search_meta('uploadDate')))
          if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'):
-            upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d')
+            upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') or upload_date
          info['upload_date'] = upload_date
  
          for to, frm in fallbacks.items():
@@ -4211,7 +4271,7 @@ def _get_uncropped(url):
                  self._extract_visitor_data(data, ytcfg)),
              **metadata)
  
-    def _extract_mix_playlist(self, playlist, playlist_id, data, ytcfg):
+    def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg):
          first_id = last_id = response = None
          for page_num in itertools.count(1):
              videos = list(self._playlist_entries(playlist))
@@ -4220,11 +4280,7 @@ def _extract_mix_playlist(self, playlist, playlist_id, data, ytcfg):
              start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1
              if start >= len(videos):
                  return
-            for video in videos[start:]:
-                if video['id'] == first_id:
-                    self.to_screen('First video %s found again; Assuming end of Mix' % first_id)
-                    return
-                yield video
+            yield from videos[start:]
              first_id = first_id or videos[0]['id']
              last_id = videos[-1]['id']
              watch_endpoint = try_get(
@@ -4255,13 +4311,18 @@ def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg):
          playlist_url = urljoin(url, try_get(
              playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
              compat_str))
-        if playlist_url and playlist_url != url:
+
+        # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1]
+        # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg
+        is_known_unviewable = re.fullmatch(r'MLCT|RLTD[\w-]{22}', playlist_id)
+
+        if playlist_url and playlist_url != url and not is_known_unviewable:
              return self.url_result(
                  playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                  video_title=title)
  
          return self.playlist_result(
-            self._extract_mix_playlist(playlist, playlist_id, data, ytcfg),
+            self._extract_inline_playlist(playlist, playlist_id, data, ytcfg),
              playlist_id=playlist_id, playlist_title=title)
  
      def _extract_availability(self, data):
@@ -5728,16 +5789,17 @@ def _real_extract(self, url):
  class YoutubeFeedsInfoExtractor(InfoExtractor):
      """
      Base class for feed extractors
-    Subclasses must define the _FEED_NAME property.
+    Subclasses must override the _FEED_NAME attribute.
      """
      _LOGIN_REQUIRED = True
+    _FEED_NAME = 'feeds'
  
      def _real_initialize(self):
          YoutubeBaseInfoExtractor._check_login_required(self)
  
-    @property
+    @classproperty
      def IE_NAME(self):
-        return 'youtube:%s' % self._FEED_NAME
+        return f'youtube:{self._FEED_NAME}'
  
      def _real_extract(self, url):
          return self.url_result(
@@ -5798,6 +5860,22 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
      }]
  
  
+class YoutubeStoriesIE(InfoExtractor):
+    IE_DESC = 'YouTube channel stories; "ytstories:" prefix'
+    IE_NAME = 'youtube:stories'
+    _VALID_URL = r'ytstories:UC(?P<id>[A-Za-z0-9_-]{21}[AQgw])$'
+    _TESTS = [{
+        'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = f'RLTD{self._match_id(url)}'
+        return self.url_result(
+            f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1',
+            ie=YoutubeTabIE, video_id=playlist_id)
+
+
  class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list