[youtube] Make --write-annotations non-fatal (closes #21452)

[yt-dlp.git] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 5e0a9e10cb56d3a906102e3dced5da98a09c59ec..1010c8616c83f5cabb06c5a18d7c7578d9af1354 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -16,6 +16,7 @@
  from ..swfinterp import SWFInterpreter
  from ..compat import (
      compat_chr,
+    compat_HTTPError,
      compat_kwargs,
      compat_parse_qs,
      compat_urllib_parse_unquote,
@@ -27,6 +28,7 @@
  )
  from ..utils import (
      clean_html,
+    dict_get,
      error_to_compat_str,
      ExtractorError,
      float_or_none,
@@ -287,10 +289,25 @@ def _entries(self, page, playlist_id):
              if not mobj:
                  break
  
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
+            count = 0
+            retries = 3
+            while count <= retries:
+                try:
+                    # Downloading a page may result in an intermittent 5xx HTTP error
+                    # that is usually worked around with a retry
+                    more = self._download_json(
+                        'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                        'Downloading page #%s%s'
+                        % (page_num, ' (retry #%d)' % count if count else ''),
+                        transform_source=uppercase_escape)
+                    break
+                except ExtractorError as e:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
+                        count += 1
+                        if count <= retries:
+                            continue
+                    raise
+
              content_html = more['content_html']
              if not content_html.strip():
                  # Some webpages show a "Load more" button but they don't
@@ -483,6 +500,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          # RTMP (unnamed)
          '_rtmp': {'protocol': 'rtmp'},
+
+        # av01 video only formats sometimes served with "unknown" codecs
+        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
      }
      _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
  
@@ -1558,8 +1581,15 @@ def extract_id(cls, url):
          return video_id
  
      def _extract_annotations(self, video_id):
-        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
-        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+        return self._download_webpage(
+            'https://www.youtube.com/annotations_invideo', video_id,
+            note='Downloading annotations',
+            errnote='Unable to download video annotations', fatal=False,
+            query={
+                'features': 1,
+                'legacy': 1,
+                'video_id': video_id,
+            })
  
      @staticmethod
      def _extract_chapters(description, duration):
@@ -1652,6 +1682,9 @@ def add_dash_mpd_pr(pl_response):
          def extract_view_count(v_info):
              return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
  
+        def extract_token(v_info):
+            return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
+
          player_response = {}
  
          # Get video info
@@ -1711,7 +1744,7 @@ def extract_view_count(v_info):
                  # The general idea is to take a union of itags of both DASH manifests (for example
                  # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)
                  self.report_video_info_webpage_download(video_id)
-                for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+                for el in ('embedded', 'detailpage', 'vevo', ''):
                      query = {
                          'video_id': video_id,
                          'ps': 'default',
@@ -1741,7 +1774,7 @@ def extract_view_count(v_info):
                          view_count = extract_view_count(get_video_info)
                      if not video_info:
                          video_info = get_video_info
-                    get_token = get_video_info.get('token') or get_video_info.get('account_playback_token')
+                    get_token = extract_token(get_video_info)
                      if get_token:
                          # Different get_video_info requests may report different results, e.g.
                          # some may report video unavailability, but some may serve it without
@@ -1752,7 +1785,7 @@ def extract_view_count(v_info):
                          # due to YouTube measures against IP ranges of hosting providers.
                          # Working around by preferring the first succeeded video_info containing
                          # the token if no such video_info yet was found.
-                        token = video_info.get('token') or video_info.get('account_playback_token')
+                        token = extract_token(video_info)
                          if not token:
                              video_info = get_video_info
                          break
@@ -1769,31 +1802,6 @@ def extract_unavailable_message():
              raise ExtractorError(
                  'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
  
-        token = video_info.get('token') or video_info.get('account_playback_token')
-        if not token:
-            if 'reason' in video_info:
-                if 'The uploader has not made this video available in your country.' in video_info['reason']:
-                    regions_allowed = self._html_search_meta(
-                        'regionsAllowed', video_webpage, default=None)
-                    countries = regions_allowed.split(',') if regions_allowed else None
-                    self.raise_geo_restricted(
-                        msg=video_info['reason'][0], countries=countries)
-                reason = video_info['reason'][0]
-                if 'Invalid parameters' in reason:
-                    unavailable_message = extract_unavailable_message()
-                    if unavailable_message:
-                        reason = unavailable_message
-                raise ExtractorError(
-                    'YouTube said: %s' % reason,
-                    expected=True, video_id=video_id)
-            else:
-                raise ExtractorError(
-                    '"token" parameter not in video info for unknown reason',
-                    video_id=video_id)
-
-        if video_info.get('license_info'):
-            raise ExtractorError('This video is DRM protected.', expected=True)
-
          video_details = try_get(
              player_response, lambda x: x['videoDetails'], dict) or {}
  
@@ -1929,7 +1937,7 @@ def _extract_filesize(media_url):
              formats = []
              for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
-                if 'itag' not in url_data or 'url' not in url_data:
+                if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
                      continue
                  stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
                  # Unsupported FORMAT_STREAM_TYPE_OTF
@@ -1989,7 +1997,8 @@ def _extract_filesize(media_url):
  
                      signature = self._decrypt_signature(
                          encrypted_sig, video_id, player_url, age_gate)
-                    url += '&signature=' + signature
+                    sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
+                    url += '&%s=%s' % (sp, signature)
                  if 'ratebypass' not in url:
                      url += '&ratebypass=yes'
  
@@ -2053,8 +2062,8 @@ def _extract_filesize(media_url):
                  url_or_none(try_get(
                      player_response,
                      lambda x: x['streamingData']['hlsManifestUrl'],
-                    compat_str)) or
-                url_or_none(try_get(
+                    compat_str))
+                or url_or_none(try_get(
                      video_info, lambda x: x['hlsvp'][0], compat_str)))
              if manifest_url:
                  formats = []
@@ -2102,8 +2111,13 @@ def _extract_filesize(media_url):
          else:
              self._downloader.report_warning('unable to extract uploader nickname')
  
-        channel_id = self._html_search_meta(
-            'channelId', video_webpage, 'channel id')
+        channel_id = (
+            str_or_none(video_details.get('channelId'))
+            or self._html_search_meta(
+                'channelId', video_webpage, 'channel id', default=None)
+            or self._search_regex(
+                r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                video_webpage, 'channel id', default=None, group='id'))
          channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
  
          # thumbnail image
@@ -2223,6 +2237,10 @@ def _extract_count(count_name):
                  r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
                  'view count', default=None))
  
+        average_rating = (
+            float_or_none(video_details.get('averageRating'))
+            or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
+
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
          automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
@@ -2296,6 +2314,32 @@ def decrypt_sig(mobj):
                      if f.get('vcodec') != 'none':
                          f['stretched_ratio'] = ratio
  
+        if not formats:
+            token = extract_token(video_info)
+            if not token:
+                if 'reason' in video_info:
+                    if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                        regions_allowed = self._html_search_meta(
+                            'regionsAllowed', video_webpage, default=None)
+                        countries = regions_allowed.split(',') if regions_allowed else None
+                        self.raise_geo_restricted(
+                            msg=video_info['reason'][0], countries=countries)
+                    reason = video_info['reason'][0]
+                    if 'Invalid parameters' in reason:
+                        unavailable_message = extract_unavailable_message()
+                        if unavailable_message:
+                            reason = unavailable_message
+                    raise ExtractorError(
+                        'YouTube said: %s' % reason,
+                        expected=True, video_id=video_id)
+                else:
+                    raise ExtractorError(
+                        '"token" parameter not in video info for unknown reason',
+                        video_id=video_id)
+
+        if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
          self._sort_formats(formats)
  
          self.mark_watched(video_id, video_info, player_response)
@@ -2326,7 +2370,7 @@ def decrypt_sig(mobj):
              'view_count': view_count,
              'like_count': like_count,
              'dislike_count': dislike_count,
-            'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
+            'average_rating': average_rating,
              'formats': formats,
              'is_live': is_live,
              'start_time': start_time,
@@ -2537,9 +2581,9 @@ def _extract_mix(self, playlist_id):
  
          search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
          title_span = (
-            search_title('playlist-title') or
-            search_title('title long-title') or
-            search_title('title'))
+            search_title('playlist-title')
+            or search_title('title long-title')
+            or search_title('title'))
          title = clean_html(title_span)
  
          return self.playlist_result(url_results, playlist_id, title)