Add pre-processor stage `after_filter`

author pukkandan <redacted>

Tue, 22 Feb 2022 11:43:30 +0000 (17:13 +0530)

committer pukkandan <redacted>

Tue, 22 Feb 2022 22:56:48 +0000 (04:26 +0530)
author pukkandan <redacted>
Tue, 22 Feb 2022 11:43:30 +0000 (17:13 +0530)
committer pukkandan <redacted>
Tue, 22 Feb 2022 22:56:48 +0000 (04:26 +0530)
diff --git a/README.md b/README.md

index 88ddb2f3b767fcaf3450d37fc164af1a513ab730..70b2e202fa426468dd752ceae5245495cc919e84 100644 (file)
--- a/README.md
+++ b/README.md
@@ -982,15 +982,17 @@ ## Post-Processing Options:
                                       semicolon ";" delimited list of NAME=VALUE.
                                       The "when" argument determines when the
                                       postprocessor is invoked. It can be one of
-                                     "pre_process" (after extraction),
-                                     "before_dl" (before video download),
-                                     "post_process" (after video download;
-                                     default), "after_move" (after moving file
-                                     to their final locations), "after_video"
-                                     (after downloading and processing all
-                                     formats of a video), or "playlist" (end of
-                                     playlist). This option can be used multiple
-                                     times to add different postprocessors
+                                     "pre_process" (after video extraction),
+                                     "after_filter" (after video passes filter),
+                                     "before_dl" (before each video download),
+                                     "post_process" (after each video download;
+                                     default), "after_move" (after moving video
+                                     file to its final location),
+                                     "after_video" (after downloading and
+                                     processing all formats of a video), or
+                                     "playlist" (at end of playlist). This
+                                     option can be used multiple times to add
+                                     different postprocessors
  
  ## SponsorBlock Options:
  Make chapter entries for, or remove various segments (sponsor,
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py

index 34ed814b42cd7e22055b7a2cb497f2f4e0dd1b5a..7637297be7499c6c590e709eb55f414b6031015c 100644 (file)
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -30,9 +30,7 @@ def __init__(self, *args, **kwargs):
          self.msgs = []
  
      def process_info(self, info_dict):
-        info_dict = info_dict.copy()
-        info_dict.pop('__original_infodict', None)
-        self.downloaded_info_dicts.append(info_dict)
+        self.downloaded_info_dicts.append(info_dict.copy())
  
      def to_screen(self, msg):
          self.msgs.append(msg)
@@ -898,20 +896,6 @@ def run(self, info):
          os.unlink(filename)
  
      def test_match_filter(self):
-        class FilterYDL(YDL):
-            def __init__(self, *args, **kwargs):
-                super(FilterYDL, self).__init__(*args, **kwargs)
-                self.params['simulate'] = True
-
-            def process_info(self, info_dict):
-                super(YDL, self).process_info(info_dict)
-
-            def _match_entry(self, info_dict, incomplete=False):
-                res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
-                if res is None:
-                    self.downloaded_info_dicts.append(info_dict.copy())
-                return res
-
          first = {
              'id': '1',
              'url': TEST_URL,
@@ -939,7 +923,7 @@ def _match_entry(self, info_dict, incomplete=False):
          videos = [first, second]
  
          def get_videos(filter_=None):
-            ydl = FilterYDL({'match_filter': filter_})
+            ydl = YDL({'match_filter': filter_, 'simulate': True})
              for v in videos:
                  ydl.process_ie_result(v, download=True)
              return [v['id'] for v in ydl.downloaded_info_dicts]
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 36b2b37c0f447d475f7eb85f38c417adf438db4b..d9a3c0bcef69153b4b672678ed9ed5fc374b0ea7 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1037,8 +1037,7 @@ def validate_outtmpl(cls, outtmpl):
      @staticmethod
      def _copy_infodict(info_dict):
          info_dict = dict(info_dict)
-        for key in ('__original_infodict', '__postprocessors'):
-            info_dict.pop(key, None)
+        info_dict.pop('__postprocessors', None)
          return info_dict
  
      def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
@@ -2512,8 +2511,6 @@ def is_wellformed(f):
          if '__x_forwarded_for_ip' in info_dict:
              del info_dict['__x_forwarded_for_ip']
  
-        # TODO Central sorting goes here
-
          if self.params.get('check_formats') is True:
              formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
  
@@ -2526,6 +2523,12 @@ def is_wellformed(f):
  
          info_dict, _ = self.pre_process(info_dict)
  
+        if self._match_entry(info_dict) is not None:
+            return info_dict
+
+        self.post_extract(info_dict)
+        info_dict, _ = self.pre_process(info_dict, 'after_filter')
+
          # The pre-processors may have modified the formats
          formats = info_dict.get('formats', [info_dict])
  
@@ -2610,15 +2613,12 @@ def is_wellformed(f):
                      + ', '.join([f['format_id'] for f in formats_to_download]))
              max_downloads_reached = False
              for i, fmt in enumerate(formats_to_download):
-                formats_to_download[i] = new_info = dict(info_dict)
-                # Save a reference to the original info_dict so that it can be modified in process_info if needed
+                formats_to_download[i] = new_info = self._copy_infodict(info_dict)
                  new_info.update(fmt)
-                new_info['__original_infodict'] = info_dict
                  try:
                      self.process_info(new_info)
                  except MaxDownloadsReached:
                      max_downloads_reached = True
-                new_info.pop('__original_infodict')
                  # Remove copied info
                  for key, val in tuple(new_info.items()):
                      if info_dict.get(key) == val:
@@ -2826,7 +2826,7 @@ def existing_file(self, filepaths, *, default_overwrite=True):
          return None
  
      def process_info(self, info_dict):
-        """Process a single resolved IE result. (Modified it in-place)"""
+        """Process a single resolved IE result. (Modifies it in-place)"""
  
          assert info_dict.get('_type', 'video') == 'video'
          original_infodict = info_dict
@@ -2834,18 +2834,22 @@ def process_info(self, info_dict):
          if 'format' not in info_dict and 'ext' in info_dict:
              info_dict['format'] = info_dict['ext']
  
+        # This is mostly just for backward compatibility of process_info
+        # As a side-effect, this allows for format-specific filters
          if self._match_entry(info_dict) is not None:
              info_dict['__write_download_archive'] = 'ignore'
              return
  
+        # Does nothing under normal operation - for backward compatibility of process_info
          self.post_extract(info_dict)
-        self._num_downloads += 1
  
          # info_dict['_filename'] needs to be set for backward compatibility
          info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
          temp_filename = self.prepare_filename(info_dict, 'temp')
          files_to_move = {}
  
+        self._num_downloads += 1
+
          # Forced printings
          self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
  
@@ -3259,17 +3263,14 @@ def sanitize_info(info_dict, remove_private_keys=False):
              return info_dict
          info_dict.setdefault('epoch', int(time.time()))
          info_dict.setdefault('_type', 'video')
-        remove_keys = {'__original_infodict'}  # Always remove this since this may contain a copy of the entire dict
-        keep_keys = ['_type']  # Always keep this to facilitate load-info-json
+
          if remove_private_keys:
-            remove_keys |= {
+            reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in {
                  'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
                  'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
              }
-            reject = lambda k, v: k not in keep_keys and (
-                k.startswith('_') or k in remove_keys or v is None)
          else:
-            reject = lambda k, v: k in remove_keys
+            reject = lambda k, v: False
  
          def filter_fn(obj):
              if isinstance(obj, dict):
@@ -3296,14 +3297,8 @@ def actual_post_extract(info_dict):
                      actual_post_extract(video_dict or {})
                  return
  
-            post_extractor = info_dict.get('__post_extractor') or (lambda: {})
-            extra = post_extractor().items()
-            info_dict.update(extra)
-            info_dict.pop('__post_extractor', None)
-
-            original_infodict = info_dict.get('__original_infodict') or {}
-            original_infodict.update(extra)
-            original_infodict.pop('__post_extractor', None)
+            post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
+            info_dict.update(post_extractor())
  
          actual_post_extract(info_dict or {})
  
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py

index b93f47ecc9be42c101b9603266a45b778f82a93c..c87c5b6df9df0f30b79c9605509e33acc8d5bfe2 100644 (file)
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -474,8 +474,8 @@ def report_unplayable_conflict(opt_name, arg, default=False, allowed=None):
              'key': 'SponsorBlock',
              'categories': sponsorblock_query,
              'api': opts.sponsorblock_api,
-            # Run this immediately after extraction is complete
-            'when': 'pre_process'
+            # Run this after filtering videos
+            'when': 'after_filter'
          })
      if opts.parse_metadata:
          postprocessors.append({
diff --git a/yt_dlp/options.py b/yt_dlp/options.py

index 2ba7d2601bb311cf9e73403a6f079c69ca365122..6fcef98cd99aed420356373d46bcfd81b3a81b9d 100644 (file)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -1550,11 +1550,11 @@ def _dict_from_options_callback(
              'and (optionally) arguments to be passed to it, separated by a colon ":". '
              'ARGS are a semicolon ";" delimited list of NAME=VALUE. '
              'The "when" argument determines when the postprocessor is invoked. '
-            'It can be one of "pre_process" (after extraction), '
-            '"before_dl" (before video download), "post_process" (after video download; default), '
-            '"after_move" (after moving file to their final locations), '
+            'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), '
+            '"before_dl" (before each video download), "post_process" (after each video download; default), '
+            '"after_move" (after moving video file to it\'s final locations), '
              '"after_video" (after downloading and processing all formats of a video), '
-            'or "playlist" (end of playlist). '
+            'or "playlist" (at end of playlist). '
              'This option can be used multiple times to add different postprocessors'))
  
      sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=(
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index f5cad0e541676f5ce0e65681bf1f4abd187e8ca2..8b0d95efada1b6dd1f7f143f4a8af9e6996722c5 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3166,7 +3166,7 @@ def q(qid):
      return q
  
  
-POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
+POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
  
  
  DEFAULT_OUTTMPL = {
author	pukkandan <redacted>
	Tue, 22 Feb 2022 11:43:30 +0000 (17:13 +0530)
committer	pukkandan <redacted>
	Tue, 22 Feb 2022 22:56:48 +0000 (04:26 +0530)
README.md		patch \| blob \| blame \| history
test/test_YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/YoutubeDL.py		patch \| blob \| blame \| history
yt_dlp/__init__.py		patch \| blob \| blame \| history
yt_dlp/options.py		patch \| blob \| blame \| history
yt_dlp/utils.py		patch \| blob \| blame \| history