]> jfr.im git - yt-dlp.git/commitdiff
Add option `--extractor-retries` to retry on known extractor errors
authorpukkandan <redacted>
Sun, 28 Feb 2021 23:48:37 +0000 (05:18 +0530)
committerpukkandan <redacted>
Sun, 28 Feb 2021 23:48:37 +0000 (05:18 +0530)
* Currently only used by youtube

Fixes https://github.com/ytdl-org/youtube-dl/issues/28194
Possibly also fixes: https://github.com/ytdl-org/youtube-dl/issues/28289 (can not confirm since the issue isn't reliably reproducible)

yt_dlp/YoutubeDL.py
yt_dlp/__init__.py
yt_dlp/extractor/youtube.py
yt_dlp/options.py

index e9cb7e18767f0d0ca0047871d2e17d308d82dc40..d1f365814d2c7e7ee1d33cc4ea02fce70c2d47a7 100644 (file)
@@ -381,17 +381,18 @@ class YoutubeDL(object):
                         Use 'default' as the name for arguments to be passed to all PP
 
     The following options are used by the extractors:
-    dynamic_mpd:        Whether to process dynamic DASH manifests (default: True)
+    extractor_retries: Number of times to retry for known errors
+    dynamic_mpd:       Whether to process dynamic DASH manifests (default: True)
     hls_split_discontinuity: Split HLS playlists to different formats at
-                        discontinuities such as ad breaks (default: False)
+                       discontinuities such as ad breaks (default: False)
     youtube_include_dash_manifest: If True (default), DASH manifests and related
-                        data will be downloaded and processed by extractor.
-                        You can reduce network I/O by disabling it if you don't
-                        care about DASH. (only for youtube)
+                       data will be downloaded and processed by extractor.
+                       You can reduce network I/O by disabling it if you don't
+                       care about DASH. (only for youtube)
     youtube_include_hls_manifest: If True (default), HLS manifests and related
-                        data will be downloaded and processed by extractor.
-                        You can reduce network I/O by disabling it if you don't
-                        care about HLS. (only for youtube)
+                       data will be downloaded and processed by extractor.
+                       You can reduce network I/O by disabling it if you don't
+                       care about HLS. (only for youtube)
     """
 
     _NUMERIC_FIELDS = set((
index 2fd49cc8f5be32ba86d5eb8fd0728e7bfc8831ab..b8b8495e6f3cb40da6e30e60ccb36d983564c18f 100644 (file)
@@ -181,19 +181,21 @@ def _real_main(argv=None):
         # --yes-overwrites implies --no-continue
         opts.continue_dl = False
 
-    def parse_retries(retries):
+    def parse_retries(retries, name=''):
         if retries in ('inf', 'infinite'):
             parsed_retries = float('inf')
         else:
             try:
                 parsed_retries = int(retries)
             except (TypeError, ValueError):
-                parser.error('invalid retry count specified')
+                parser.error('invalid %sretry count specified' % name)
         return parsed_retries
     if opts.retries is not None:
         opts.retries = parse_retries(opts.retries)
     if opts.fragment_retries is not None:
-        opts.fragment_retries = parse_retries(opts.fragment_retries)
+        opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ')
+    if opts.extractor_retries is not None:
+        opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ')
     if opts.buffersize is not None:
         numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
         if numeric_buffersize is None:
@@ -458,6 +460,7 @@ def report_args_compat(arg, name):
         'overwrites': opts.overwrites,
         'retries': opts.retries,
         'fragment_retries': opts.fragment_retries,
+        'extractor_retries': opts.extractor_retries,
         'skip_unavailable_fragments': opts.skip_unavailable_fragments,
         'keep_fragments': opts.keep_fragments,
         'buffersize': opts.buffersize,
index 804186b851e0520c56300193cda8acb765e84e7c..2e4ce4c128a5a2c4f1373ffe41d57ad4ad00cbb5 100644 (file)
@@ -2762,28 +2762,36 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
         for page_num in itertools.count(1):
             if not continuation:
                 break
-            count = 0
-            retries = 3
-            while count <= retries:
+            retries = self._downloader.params.get('extractor_retries', 3)
+            count = -1
+            last_error = None
+            while count < retries:
+                count += 1
+                if last_error:
+                    self.report_warning('%s. Retrying ...' % last_error)
                 try:
-                    # Downloading page may result in intermittent 5xx HTTP error
-                    # that is usually worked around with a retry
                     browse = self._download_json(
                         'https://www.youtube.com/browse_ajax', None,
                         'Downloading page %d%s'
                         % (page_num, ' (retry #%d)' % count if count else ''),
                         headers=headers, query=continuation)
-                    break
                 except ExtractorError as e:
-                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
-                        count += 1
-                        if count <= retries:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
+                        # Downloading page may result in intermittent 5xx HTTP error
+                       # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+                        last_error = 'HTTP Error %s' % e.cause.code
+                        if count < retries:
                             continue
                     raise
-            if not browse:
-                break
-            response = try_get(browse, lambda x: x[1]['response'], dict)
-            if not response:
+                else:
+                    response = try_get(browse, lambda x: x[1]['response'], dict)
+
+                    # Youtube sometimes sends incomplete data
+                    # See: https://github.com/ytdl-org/youtube-dl/issues/28194
+                    if response.get('continuationContents') or response.get('onResponseReceivedActions'):
+                        break
+                    last_error = 'Incomplete data received'
+            if not browse or not response:
                 break
 
             known_continuation_renderers = {
@@ -3004,11 +3012,16 @@ def _real_extract(self, url):
                 return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
             self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
-        count = 0
-        retries = 3
+        retries = self._downloader.params.get('extractor_retries', 3)
+        count = -1
         while count < retries:
+            count += 1
             # Sometimes youtube returns a webpage with incomplete ytInitialData
-            webpage = self._download_webpage(url, item_id)
+            # See: https://github.com/yt-dlp/yt-dlp/issues/116
+            if count:
+                self.report_warning('Incomplete yt initial data received. Retrying ...')
+            webpage = self._download_webpage(url, item_id,
+                'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
             identity_token = self._extract_identity_token(webpage, item_id)
             data = self._extract_yt_initial_data(item_id, webpage)
             err_msg = None
@@ -3023,9 +3036,6 @@ def _real_extract(self, url):
                 raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
             if data.get('contents') or data.get('currentVideoEndpoint'):
                 break
-            count += 1
-            self.to_screen(
-                'Incomplete yt initial data recieved. Retrying (attempt %d of %d)...' % (count, retries))
 
         tabs = try_get(
             data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
index 59c08fb1882fc6e51f43d4d8fe3ebf39ac1cedbd..866c50cb96395ba692fd5618892a938118cf75a0 100644 (file)
@@ -1218,6 +1218,10 @@ def _dict_from_multiple_values_options_callback(
         help=optparse.SUPPRESS_HELP)
 
     extractor = optparse.OptionGroup(parser, 'Extractor Options')
+    extractor.add_option(
+        '--extractor-retries',
+        dest='extractor_retries', metavar='RETRIES', default=10,
+        help='Number of retries for known extractor errors (default is %default), or "infinite"')
     extractor.add_option(
         '--allow-dynamic-mpd', '--no-ignore-dynamic-mpd',
         action='store_true', dest='dynamic_mpd', default=True,