Add option `--break-match-filters`

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 9ef56a46b69e863b45b4922961da0328b4049579..5d21b43cf7e5a791ed29232e3171f32c342b7ddf 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -33,7 +33,7 @@
  from .extractor.openload import PhantomJSwrapper
  from .minicurses import format_text
  from .plugins import directories as plugin_directories
-from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
+from .postprocessor import _PLUGIN_CLASSES as plugin_pps
  from .postprocessor import (
      EmbedThumbnailPP,
      FFmpegFixupDuplicateMoovPP,
@@ -150,7 +150,7 @@
      write_json_file,
      write_string,
  )
-from .version import RELEASE_GIT_HEAD, VARIANT, __version__
+from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__
  
  if compat_os_name == 'nt':
      import ctypes
@@ -300,8 +300,6 @@ class YoutubeDL:
                         Videos already present in the file are not downloaded again.
      break_on_existing: Stop the download process after attempting to download a
                         file that is in the archive.
-    break_on_reject:   Stop the download process when encountering a video that
-                       has been filtered out.
      break_per_url:     Whether break_on_reject and break_on_existing
                         should act on each input URL as opposed to for the entire queue
      cookiefile:        File name or text stream from where cookies should be read and dumped to
@@ -318,6 +316,7 @@ class YoutubeDL:
                          If not provided and the key is encrypted, yt-dlp will ask interactively
      prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                         (Only supported by some extractors)
+    enable_file_urls:  Enable file:// URLs. This is disabled by default for security reasons.
      http_headers:      A dictionary of custom headers to be used for all requests
      proxy:             URL of the proxy server to use
      geo_verification_proxy:  URL of the proxy to use for IP address verification
@@ -413,6 +412,8 @@ class YoutubeDL:
                         - If it returns None, the video is downloaded.
                         - If it returns utils.NO_DEFAULT, the user is interactively
                           asked whether to download the video.
+                       - Raise utils.DownloadCancelled(msg) to abort remaining
+                         downloads when a video is rejected.
                         match_filter_func in utils.py is one example for this.
      no_color:          Do not emit color codes in output.
      geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
@@ -482,6 +483,9 @@ class YoutubeDL:
  
      The following options are deprecated and may be removed in the future:
  
+    break_on_reject:   Stop the download process when encountering a video that
+                       has been filtered out.
+                       - `raise DownloadCancelled(msg)` in match_filter instead
      force_generic_extractor: Force downloader to use the generic extractor
                         - Use allowed_extractors = ['generic', 'default']
      playliststart:     - Use playlist_items
@@ -553,7 +557,7 @@ class YoutubeDL:
          'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
          'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
          'preference', 'language', 'language_preference', 'quality', 'source_preference',
-        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
+        'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'hls_aes', 'downloader_options',
          'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
      }
      _format_selection_exts = {
@@ -585,7 +589,6 @@ def __init__(self, params=None, auto_init=True):
          self._playlist_urls = set()
          self.cache = Cache(self)
  
-        windows_enable_vt_mode()
          stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout
          self._out_files = Namespace(
              out=stdout,
@@ -594,6 +597,12 @@ def __init__(self, params=None, auto_init=True):
              console=None if compat_os_name == 'nt' else next(
                  filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None)
          )
+
+        try:
+            windows_enable_vt_mode()
+        except Exception as e:
+            self.write_debug(f'Failed to enable VT mode: {e}')
+
          self._allow_colors = Namespace(**{
              type_: not self.params.get('no_color') and supports_terminal_sequences(stream)
              for type_, stream in self._out_files.items_ if type_ != 'console'
@@ -608,7 +617,7 @@ def __init__(self, params=None, auto_init=True):
                     '\n                    You will no longer receive updates on this version')
              if current_version < MIN_SUPPORTED:
                  msg = 'Python version %d.%d is no longer supported'
-            self.deprecation_warning(
+            self.deprecated_feature(
                  f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED))
  
          if self.params.get('allow_unplayable_formats'):
@@ -1401,31 +1410,44 @@ def check_filter():
                  return 'Skipping "%s" because it is age restricted' % video_title
  
              match_filter = self.params.get('match_filter')
-            if match_filter is not None:
+            if match_filter is None:
+                return None
+
+            cancelled = None
+            try:
                  try:
                      ret = match_filter(info_dict, incomplete=incomplete)
                  except TypeError:
                      # For backward compatibility
                      ret = None if incomplete else match_filter(info_dict)
-                if ret is NO_DEFAULT:
-                    while True:
-                        filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
-                        reply = input(self._format_screen(
-                            f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
-                        if reply in {'y', ''}:
-                            return None
-                        elif reply == 'n':
-                            return f'Skipping {video_title}'
-                elif ret is not None:
-                    return ret
-            return None
+            except DownloadCancelled as err:
+                if err.msg is not NO_DEFAULT:
+                    raise
+                ret, cancelled = err.msg, err
+
+            if ret is NO_DEFAULT:
+                while True:
+                    filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME)
+                    reply = input(self._format_screen(
+                        f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip()
+                    if reply in {'y', ''}:
+                        return None
+                    elif reply == 'n':
+                        if cancelled:
+                            raise type(cancelled)(f'Skipping {video_title}')
+                        return f'Skipping {video_title}'
+            return ret
  
          if self.in_download_archive(info_dict):
              reason = '%s has already been recorded in the archive' % video_title
              break_opt, break_err = 'break_on_existing', ExistingVideoReached
          else:
-            reason = check_filter()
-            break_opt, break_err = 'break_on_reject', RejectedVideoReached
+            try:
+                reason = check_filter()
+            except DownloadCancelled as e:
+                reason, break_opt, break_err = e.msg, 'match_filter', type(e)
+            else:
+                break_opt, break_err = 'break_on_reject', RejectedVideoReached
          if reason is not None:
              if not silent:
                  self.to_screen('[download] ' + reason)
@@ -1771,7 +1793,7 @@ def _playlist_infodict(ie_result, strict=False, **kwargs):
          return {
              **info,
              'playlist_index': 0,
-            '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
+            '__last_playlist_index': max(ie_result.get('requested_entries') or (0, 0)),
              'extractor': ie_result['extractor'],
              'extractor_key': ie_result['extractor_key'],
          }
@@ -2405,11 +2427,7 @@ def check_thumbnails(thumbnails):
      def _fill_common_fields(self, info_dict, final=True):
          # TODO: move sanitization here
          if final:
-            title = info_dict.get('title', NO_DEFAULT)
-            if title is NO_DEFAULT:
-                raise ExtractorError('Missing "title" field in extractor result',
-                                     video_id=info_dict['id'], ie=info_dict['extractor'])
-            info_dict['fulltitle'] = title
+            title = info_dict['fulltitle'] = info_dict.get('title')
              if not title:
                  if title == '':
                      self.write_debug('Extractor gave empty title. Creating a generic title')
@@ -2464,15 +2482,8 @@ def _raise_pending_errors(self, info):
  
      def sort_formats(self, info_dict):
          formats = self._get_formats(info_dict)
-        if not formats:
-            return
-        # Backward compatibility with InfoExtractor._sort_formats
-        field_preference = formats[0].pop('__sort_fields', None)
-        if field_preference:
-            info_dict['_format_sort_fields'] = field_preference
-
          formats.sort(key=FormatSorter(
-            self, info_dict.get('_format_sort_fields', [])).calculate_preference)
+            self, info_dict.get('_format_sort_fields') or []).calculate_preference)
  
      def process_video_result(self, info_dict, download=True):
          assert info_dict.get('_type', 'video') == 'video'
@@ -2559,9 +2570,13 @@ def sanitize_numeric_fields(info):
          info_dict['requested_subtitles'] = self.process_subtitles(
              info_dict['id'], subtitles, automatic_captions)
  
-        self.sort_formats(info_dict)
          formats = self._get_formats(info_dict)
  
+        # Backward compatibility with InfoExtractor._sort_formats
+        field_preference = (formats or [{}])[0].pop('__sort_fields', None)
+        if field_preference:
+            info_dict['_format_sort_fields'] = field_preference
+
          # or None ensures --clean-infojson removes it
          info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
          if not self.params.get('allow_unplayable_formats'):
@@ -2599,22 +2614,43 @@ def is_wellformed(f):
          if not formats:
              self.raise_no_formats(info_dict)
  
-        formats_dict = {}
-
-        # We check that all the formats have the format and format_id fields
-        for i, format in enumerate(formats):
+        for format in formats:
              sanitize_string_field(format, 'format_id')
              sanitize_numeric_fields(format)
              format['url'] = sanitize_url(format['url'])
+            if format.get('ext') is None:
+                format['ext'] = determine_ext(format['url']).lower()
+            if format.get('protocol') is None:
+                format['protocol'] = determine_protocol(format)
+            if format.get('resolution') is None:
+                format['resolution'] = self.format_resolution(format, default=None)
+            if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
+                format['dynamic_range'] = 'SDR'
+            if format.get('aspect_ratio') is None:
+                format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
+            if (info_dict.get('duration') and format.get('tbr')
+                    and not format.get('filesize') and not format.get('filesize_approx')):
+                format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
+            format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict))
+
+        # This is copied to http_headers by the above _calc_headers and can now be removed
+        if '__x_forwarded_for_ip' in info_dict:
+            del info_dict['__x_forwarded_for_ip']
+
+        self.sort_formats({
+            'formats': formats,
+            '_format_sort_fields': info_dict.get('_format_sort_fields')
+        })
+
+        # Sanitize and group by format_id
+        formats_dict = {}
+        for i, format in enumerate(formats):
              if not format.get('format_id'):
                  format['format_id'] = str(i)
              else:
                  # Sanitize format_id from characters used in format selector expression
                  format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
-            format_id = format['format_id']
-            if format_id not in formats_dict:
-                formats_dict[format_id] = []
-            formats_dict[format_id].append(format)
+            formats_dict.setdefault(format['format_id'], []).append(format)
  
          # Make sure all formats have unique format_id
          common_exts = set(itertools.chain(*self._format_selection_exts.values()))
@@ -2623,40 +2659,17 @@ def is_wellformed(f):
              for i, format in enumerate(ambiguous_formats):
                  if ambigious_id:
                      format['format_id'] = '%s-%d' % (format_id, i)
-                if format.get('ext') is None:
-                    format['ext'] = determine_ext(format['url']).lower()
                  # Ensure there is no conflict between id and ext in format selection
                  # See https://github.com/yt-dlp/yt-dlp/issues/1282
                  if format['format_id'] != format['ext'] and format['format_id'] in common_exts:
                      format['format_id'] = 'f%s' % format['format_id']
  
-        for i, format in enumerate(formats):
-            if format.get('format') is None:
-                format['format'] = '{id} - {res}{note}'.format(
-                    id=format['format_id'],
-                    res=self.format_resolution(format),
-                    note=format_field(format, 'format_note', ' (%s)'),
-                )
-            if format.get('protocol') is None:
-                format['protocol'] = determine_protocol(format)
-            if format.get('resolution') is None:
-                format['resolution'] = self.format_resolution(format, default=None)
-            if format.get('dynamic_range') is None and format.get('vcodec') != 'none':
-                format['dynamic_range'] = 'SDR'
-            if format.get('aspect_ratio') is None:
-                format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
-            if (info_dict.get('duration') and format.get('tbr')
-                    and not format.get('filesize') and not format.get('filesize_approx')):
-                format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
-
-            # Add HTTP headers, so that external programs can use them from the
-            # json output
-            full_format_info = info_dict.copy()
-            full_format_info.update(format)
-            format['http_headers'] = self._calc_headers(full_format_info)
-        # Remove private housekeeping stuff
-        if '__x_forwarded_for_ip' in info_dict:
-            del info_dict['__x_forwarded_for_ip']
+                if format.get('format') is None:
+                    format['format'] = '{id} - {res}{note}'.format(
+                        id=format['format_id'],
+                        res=self.format_resolution(format),
+                        note=format_field(format, 'format_note', ' (%s)'),
+                    )
  
          if self.params.get('check_formats') is True:
              formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
@@ -2813,10 +2826,14 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
                      self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
              except re.error as e:
                  raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}')
-        elif normal_sub_langs:
-            requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
          else:
-            requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
+            requested_langs = LazyList(itertools.chain(
+                ['en'] if 'en' in normal_sub_langs else [],
+                filter(lambda f: f.startswith('en'), normal_sub_langs),
+                ['en'] if 'en' in all_sub_langs else [],
+                filter(lambda f: f.startswith('en'), all_sub_langs),
+                normal_sub_langs, all_sub_langs,
+            ))[:1]
          if requested_langs:
              self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')
  
@@ -3391,6 +3408,7 @@ def sanitize_info(info_dict, remove_private_keys=False):
              reject = lambda k, v: v is None or k.startswith('__') or k in {
                  'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
                  'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber',
+                '_format_sort_fields',
              }
          else:
              reject = lambda k, v: False
@@ -3460,7 +3478,8 @@ def run_pp(self, pp, infodict):
          return infodict
  
      def run_all_pps(self, key, info, *, additional_pps=None):
-        self._forceprint(key, info)
+        if key != 'video':
+            self._forceprint(key, info)
          for pp in (additional_pps or []) + self._pps[key]:
              info = self.run_pp(pp, info)
          return info
@@ -3662,6 +3681,7 @@ def simplified_codec(f, field):
                  format_field(f, 'asr', '\t%s', func=format_decimal_suffix),
                  join_nonempty(
                      self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None,
+                    self._format_out('DRM', 'light red') if f.get('has_drm') else None,
                      format_field(f, 'language', '[%s]'),
                      join_nonempty(format_field(f, 'format_note'),
                                    format_field(f, 'container', ignore=(None, f.get('ext'))),
@@ -3729,7 +3749,10 @@ def print_debug_header(self):
  
          # These imports can be slow. So import them only as needed
          from .extractor.extractors import _LAZY_LOADER
-        from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
+        from .extractor.extractors import (
+            _PLUGIN_CLASSES as plugin_ies,
+            _PLUGIN_OVERRIDES as plugin_ie_overrides
+        )
  
          def get_encoding(stream):
              ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
@@ -3758,12 +3781,13 @@ def get_encoding(stream):
          source = detect_variant()
          if VARIANT not in (None, 'pip'):
              source += '*'
+        klass = type(self)
          write_debug(join_nonempty(
              f'{"yt-dlp" if REPOSITORY == "yt-dlp/yt-dlp" else REPOSITORY} version',
-            __version__,
-            f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
+            __version__ + {'stable': '', 'nightly': '*'}.get(CHANNEL, f' <{CHANNEL}>'),
+            f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '',
              '' if source == 'unknown' else f'({source})',
-            '' if _IN_CLI else 'API',
+            '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}',
              delim=' '))
  
          if not _IN_CLI:
@@ -3807,12 +3831,17 @@ def get_encoding(stream):
                  proxy_map.update(handler.proxies)
          write_debug(f'Proxy map: {proxy_map}')
  
-        for plugin_type, plugins in {'Extractor': plugin_extractors, 'Post-Processor': plugin_postprocessors}.items():
-            if not plugins:
-                continue
-            write_debug(f'{plugin_type} Plugins: %s' % (', '.join(sorted(('%s%s' % (
+        for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items():
+            display_list = ['%s%s' % (
                  klass.__name__, '' if klass.__name__ == name else f' as {name}')
-                for name, klass in plugins.items())))))
+                for name, klass in plugins.items()]
+            if plugin_type == 'Extractor':
+                display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})'
+                                    for parent, plugins in plugin_ie_overrides.items())
+            if not display_list:
+                continue
+            write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}')
+
          plugin_dirs = plugin_directories()
          if plugin_dirs:
              write_debug(f'Plugin directories: {plugin_dirs}')
@@ -3866,9 +3895,12 @@ def _setup_opener(self):
          # https://github.com/ytdl-org/youtube-dl/issues/8227)
          file_handler = urllib.request.FileHandler()
  
-        def file_open(*args, **kwargs):
-            raise urllib.error.URLError('file:// scheme is explicitly disabled in yt-dlp for security reasons')
-        file_handler.file_open = file_open
+        if not self.params.get('enable_file_urls'):
+            def file_open(*args, **kwargs):
+                raise urllib.error.URLError(
+                    'file:// URLs are explicitly disabled in yt-dlp for security reasons. '
+                    'Use --enable-file-urls to enable at your own risk.')
+            file_handler.file_open = file_open
  
          opener = urllib.request.build_opener(
              proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
@@ -3930,7 +3962,7 @@ def _write_description(self, label, ie_result, descfn):
          elif not self.params.get('overwrites', True) and os.path.exists(descfn):
              self.to_screen(f'[info] {label.title()} description is already present')
          elif ie_result.get('description') is None:
-            self.report_warning(f'There\'s no {label} description to write')
+            self.to_screen(f'[info] There\'s no {label} description to write')
              return False
          else:
              try:
@@ -3946,15 +3978,18 @@ def _write_subtitles(self, info_dict, filename):
          ''' Write subtitles to file and return list of (sub_filename, final_sub_filename); or None if error'''
          ret = []
          subtitles = info_dict.get('requested_subtitles')
-        if not subtitles or not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
+        if not (self.params.get('writesubtitles') or self.params.get('writeautomaticsub')):
              # subtitles download errors are already managed as troubles in relevant IE
              # that way it will silently go on when used with unsupporting IE
              return ret
-
+        elif not subtitles:
+            self.to_screen('[info] There\'s no subtitles for the requested languages')
+            return ret
          sub_filename_base = self.prepare_filename(info_dict, 'subtitle')
          if not sub_filename_base:
              self.to_screen('[info] Skipping writing video subtitles')
              return ret
+
          for sub_lang, sub_info in subtitles.items():
              sub_format = sub_info['ext']
              sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
@@ -4001,6 +4036,9 @@ def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None
          thumbnails, ret = [], []
          if write_all or self.params.get('writethumbnail', False):
              thumbnails = info_dict.get('thumbnails') or []
+            if not thumbnails:
+                self.to_screen(f'[info] There\'s no {label} thumbnails to download')
+                return ret
          multiple = write_all and len(thumbnails) > 1
  
          if thumb_filename_base is None: