[MainStreaming] Add extractor (#2180)

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 9ab8d7f08d5b3a7725978ae5ba9605d1677136c6..463251789fa3a80eff98bade0104043d3468e01a 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -91,6 +91,7 @@
      PerRequestProxyHandler,
      platform_name,
      Popen,
+    POSTPROCESS_WHEN,
      PostProcessingError,
      preferredencoding,
      prepend_extension,
@@ -199,7 +200,9 @@ class YoutubeDL(object):
      verbose:           Print additional info to stdout.
      quiet:             Do not print messages to stdout.
      no_warnings:       Do not print out anything for warnings.
-    forceprint:        A list of templates to force print
+    forceprint:        A dict with keys video/playlist mapped to
+                       a list of templates to force print to stdout
+                       For compatibility, a single list is also accepted
      forceurl:          Force printing final URL. (Deprecated)
      forcetitle:        Force printing title. (Deprecated)
      forceid:           Force printing ID. (Deprecated)
@@ -505,7 +508,7 @@ class YoutubeDL(object):
  
      params = None
      _ies = {}
-    _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+    _pps = {k: [] for k in POSTPROCESS_WHEN}
      _printed_messages = set()
      _first_webpage_request = True
      _download_retcode = None
@@ -523,7 +526,7 @@ def __init__(self, params=None, auto_init=True):
              params = {}
          self._ies = {}
          self._ies_instances = {}
-        self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []}
+        self._pps = {k: [] for k in POSTPROCESS_WHEN}
          self._printed_messages = set()
          self._first_webpage_request = True
          self._post_hooks = []
@@ -531,6 +534,7 @@ def __init__(self, params=None, auto_init=True):
          self._postprocessor_hooks = []
          self._download_retcode = 0
          self._num_downloads = 0
+        self._num_videos = 0
          self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
          self._err_file = sys.stderr
          self.params = params
@@ -585,6 +589,11 @@ def check_deprecated(param, option, suggestion):
          else:
              self.params['nooverwrites'] = not self.params['overwrites']
  
+        # Compatibility with older syntax
+        params.setdefault('forceprint', {})
+        if not isinstance(params['forceprint'], dict):
+            params['forceprint'] = {'video': params['forceprint']}
+
          if params.get('bidi_workaround', False):
              try:
                  import pty
@@ -1036,6 +1045,7 @@ def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
              if info_dict.get('duration', None) is not None
              else None)
          info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+        info_dict['video_autonumber'] = self._num_videos
          if info_dict.get('resolution') is None:
              info_dict['resolution'] = self.format_resolution(info_dict, default=None)
  
@@ -1151,7 +1161,7 @@ def create_key(outer_mobj):
              str_fmt = f'{fmt[:-1]}s'
              if fmt[-1] == 'l':  # list
                  delim = '\n' if '#' in flags else ', '
-                value, fmt = delim.join(variadic(value, allowed_types=(str, bytes))), str_fmt
+                value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
              elif fmt[-1] == 'j':  # json
                  value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
              elif fmt[-1] == 'q':  # quoted
@@ -1755,6 +1765,13 @@ def get_entry(i):
                  'updated playlist', ie_result,
                  self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
              return
+
+        for tmpl in self.params['forceprint'].get('playlist', []):
+            self._forceprint(tmpl, ie_result)
+
+        for pp in self._pps['playlist']:
+            ie_result = self.run_pp(pp, ie_result)
+
          self.to_screen('[download] Finished downloading playlist: %s' % playlist)
          return ie_result
  
@@ -2260,6 +2277,7 @@ def check_thumbnails(thumbnails):
  
      def process_video_result(self, info_dict, download=True):
          assert info_dict.get('_type', 'video') == 'video'
+        self._num_videos += 1
  
          if 'id' not in info_dict:
              raise ExtractorError('Missing "id" field in extractor result')
@@ -2378,6 +2396,9 @@ def sanitize_numeric_fields(info):
              if not get_from_start:
                  info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
  
+        # backward compatibility
+        info_dict['fulltitle'] = info_dict['title']
+
          if not formats:
              self.raise_no_formats(info_dict)
  
@@ -2538,24 +2559,47 @@ def is_wellformed(f):
              if not self.params.get('ignore_no_formats_error'):
                  raise ExtractorError('Requested format is not available', expected=True,
                                       video_id=info_dict['id'], ie=info_dict['extractor'])
-            else:
-                self.report_warning('Requested format is not available')
-                # Process what we can, even without any available formats.
-                self.process_info(dict(info_dict))
-        elif download:
-            self.to_screen(
-                '[info] %s: Downloading %d format(s): %s' % (
-                    info_dict['id'], len(formats_to_download),
-                    ", ".join([f['format_id'] for f in formats_to_download])))
-            for fmt in formats_to_download:
-                new_info = dict(info_dict)
+            self.report_warning('Requested format is not available')
+            # Process what we can, even without any available formats.
+            formats_to_download = [{}]
+
+        best_format = formats_to_download[-1]
+        if download:
+            if best_format:
+                self.to_screen(
+                    f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): '
+                    + ', '.join([f['format_id'] for f in formats_to_download]))
+            max_downloads_reached = False
+            for i, fmt in enumerate(formats_to_download):
+                formats_to_download[i] = new_info = dict(info_dict)
                  # Save a reference to the original info_dict so that it can be modified in process_info if needed
-                new_info['__original_infodict'] = info_dict
                  new_info.update(fmt)
-                self.process_info(new_info)
+                new_info['__original_infodict'] = info_dict
+                try:
+                    self.process_info(new_info)
+                except MaxDownloadsReached:
+                    max_downloads_reached = True
+                new_info.pop('__original_infodict')
+                # Remove copied info
+                for key, val in tuple(new_info.items()):
+                    if info_dict.get(key) == val:
+                        new_info.pop(key)
+                if max_downloads_reached:
+                    break
+
+            write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download)
+            assert write_archive.issubset({True, False, 'ignore'})
+            if True in write_archive and False not in write_archive:
+                self.record_download_archive(info_dict)
+
+            info_dict['requested_downloads'] = formats_to_download
+            for pp in self._pps['after_video']:
+                info_dict = self.run_pp(pp, info_dict)
+            if max_downloads_reached:
+                raise MaxDownloadsReached()
+
          # We update the info dict with the selected best quality format (backwards compatibility)
-        if formats_to_download:
-            info_dict.update(formats_to_download[-1])
+        info_dict.update(best_format)
          return info_dict
  
      def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
@@ -2626,6 +2670,14 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
              subs[lang] = f
          return subs
  
+    def _forceprint(self, tmpl, info_dict):
+        mobj = re.match(r'\w+(=?)$', tmpl)
+        if mobj and mobj.group(1):
+            tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
+        elif mobj:
+            tmpl = '%({})s'.format(tmpl)
+        self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+
      def __forced_printings(self, info_dict, filename, incomplete):
          def print_mandatory(field, actual_field=None):
              if actual_field is None:
@@ -2648,15 +2700,10 @@ def print_optional(field):
          elif 'url' in info_dict:
              info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
  
-        if self.params.get('forceprint') or self.params.get('forcejson'):
+        if self.params['forceprint'].get('video') or self.params.get('forcejson'):
              self.post_extract(info_dict)
-        for tmpl in self.params.get('forceprint', []):
-            mobj = re.match(r'\w+(=?)$', tmpl)
-            if mobj and mobj.group(1):
-                tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
-            elif mobj:
-                tmpl = '%({})s'.format(tmpl)
-            self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+        for tmpl in self.params['forceprint'].get('video', []):
+            self._forceprint(tmpl, info_dict)
  
          print_mandatory('title')
          print_mandatory('id')
@@ -2705,22 +2752,16 @@ def dl(self, name, info, subtitle=False, test=False):
          return fd.download(name, new_info, subtitle)
  
      def process_info(self, info_dict):
-        """Process a single resolved IE result."""
+        """Process a single resolved IE result. (Modified it in-place)"""
  
          assert info_dict.get('_type', 'video') == 'video'
-
-        max_downloads = self.params.get('max_downloads')
-        if max_downloads is not None:
-            if self._num_downloads >= int(max_downloads):
-                raise MaxDownloadsReached()
-
-        # TODO: backward compatibility, to be removed
-        info_dict['fulltitle'] = info_dict['title']
+        original_infodict = info_dict
  
          if 'format' not in info_dict and 'ext' in info_dict:
              info_dict['format'] = info_dict['ext']
  
          if self._match_entry(info_dict) is not None:
+            info_dict['__write_download_archive'] = 'ignore'
              return
  
          self.post_extract(info_dict)
@@ -2735,9 +2776,7 @@ def process_info(self, info_dict):
          self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
  
          if self.params.get('simulate'):
-            if self.params.get('force_write_download_archive', False):
-                self.record_download_archive(info_dict)
-            # Do nothing else if in simulate mode
+            info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
              return
  
          if full_filename is None:
@@ -2832,18 +2871,26 @@ def _write_link_file(link_type):
                 for link_type, should_write in write_links.items()):
              return
  
+        def replace_info_dict(new_info):
+            nonlocal info_dict
+            if new_info == info_dict:
+                return
+            info_dict.clear()
+            info_dict.update(new_info)
+
          try:
-            info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+            new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move)
+            replace_info_dict(new_info)
          except PostProcessingError as err:
              self.report_error('Preprocessing: %s' % str(err))
              return
  
-        must_record_download_archive = False
-        if self.params.get('skip_download', False):
+        if self.params.get('skip_download'):
              info_dict['filepath'] = temp_filename
              info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename)))
              info_dict['__files_to_move'] = files_to_move
-            info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)
+            replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict))
+            info_dict['__write_download_archive'] = self.params.get('force_write_download_archive')
          else:
              # Download
              info_dict.setdefault('__postprocessors', [])
@@ -3062,7 +3109,7 @@ def ffmpeg_fixup(cndn, msg, cls):
  
                  fixup()
                  try:
-                    info_dict = self.post_process(dl_filename, info_dict, files_to_move)
+                    replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move))
                  except PostProcessingError as err:
                      self.report_error('Postprocessing: %s' % str(err))
                      return
@@ -3072,10 +3119,14 @@ def ffmpeg_fixup(cndn, msg, cls):
                  except Exception as err:
                      self.report_error('post hooks: %s' % str(err))
                      return
-                must_record_download_archive = True
+                info_dict['__write_download_archive'] = True
+
+        if self.params.get('force_write_download_archive'):
+            info_dict['__write_download_archive'] = True
+
+        # Make sure the info_dict was modified in-place
+        assert info_dict is original_infodict
  
-        if must_record_download_archive or self.params.get('force_write_download_archive', False):
-            self.record_download_archive(info_dict)
          max_downloads = self.params.get('max_downloads')
          if max_downloads is not None and self._num_downloads >= int(max_downloads):
              raise MaxDownloadsReached()
@@ -3145,8 +3196,8 @@ def sanitize_info(info_dict, remove_private_keys=False):
          keep_keys = ['_type']  # Always keep this to facilitate load-info-json
          if remove_private_keys:
              remove_keys |= {
-                'requested_formats', 'requested_subtitles', 'requested_entries', 'entries',
-                'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
+                'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
+                'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
              }
              reject = lambda k, v: k not in keep_keys and (
                  k.startswith('_') or k in remove_keys or v is None)
@@ -3225,13 +3276,12 @@ def pre_process(self, ie_info, key='pre_process', files_to_move=None):
              info = self.run_pp(pp, info)
          return info, info.pop('__files_to_move', None)
  
-    def post_process(self, filename, ie_info, files_to_move=None):
+    def post_process(self, filename, info, files_to_move=None):
          """Run all the postprocessors on the given file."""
-        info = dict(ie_info)
          info['filepath'] = filename
          info['__files_to_move'] = files_to_move or {}
  
-        for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']:
+        for pp in info.get('__postprocessors', []) + self._pps['post_process']:
              info = self.run_pp(pp, info)
          info = self.run_pp(MoveFilesAfterDownloadPP(self), info)
          del info['__files_to_move']
@@ -3276,6 +3326,7 @@ def record_download_archive(self, info_dict):
              return
          vid_id = self._make_archive_id(info_dict)
          assert vid_id
+        self.write_debug(f'Adding to archive: {vid_id}')
          with locked_file(fn, 'a', encoding='utf-8') as archive_file:
              archive_file.write(vid_id + '\n')
          self.archive.add(vid_id)
@@ -3724,9 +3775,10 @@ def _write_subtitles(self, info_dict, filename):
                  self.dl(sub_filename, sub_copy, subtitle=True)
                  sub_info['filepath'] = sub_filename
                  ret.append((sub_filename, sub_filename_final))
-            except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+            except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err:
+                if self.params.get('ignoreerrors') is not True:  # False or 'only_download'
+                    raise DownloadError(f'Unable to download video subtitles for {sub_lang!r}: {err}', err)
                  self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}')
-                continue
          return ret
  
      def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None):