Make early reject of `--match-filter` stricter

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 58c5c47501d75baa5061b33442a5b042dde26fbb..525d3ab6e57e97212bda3a1752c9706c2cacc8b1 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -108,6 +108,7 @@
      get_domain,
      int_or_none,
      iri_to_uri,
+    is_path_like,
      join_nonempty,
      locked_file,
      make_archive_id,
@@ -251,8 +252,8 @@ class YoutubeDL:
      matchtitle:        Download only matching titles.
      rejecttitle:       Reject downloads for matching titles.
      logger:            Log messages to a logging.Logger instance.
-    logtostderr:       Log messages to stderr instead of stdout.
-    consoletitle:       Display progress in console window's titlebar.
+    logtostderr:       Print everything to stderr instead of stdout.
+    consoletitle:      Display progress in console window's titlebar.
      writedescription:  Write the video description to a .description file
      writeinfojson:     Write the video description to a .info.json file
      clean_infojson:    Remove private fields from the infojson
@@ -293,9 +294,8 @@ class YoutubeDL:
                         downloaded.
                         Videos without view count information are always
                         downloaded. None for no limit.
-    download_archive:  File name of a file where all downloads are recorded.
-                       Videos already present in the file are not downloaded
-                       again.
+    download_archive:  A set, or the name of a file where all downloads are recorded.
+                       Videos already present in the archive are not downloaded again.
      break_on_existing: Stop the download process after attempting to download a
                         file that is in the archive.
      break_on_reject:   Stop the download process when encountering a video that
@@ -548,7 +548,7 @@ class YoutubeDL:
          # NB: Keep in sync with the docstring of extractor/common.py
          'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
          'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
-        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
+        'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns',
          'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
          'preference', 'language', 'language_preference', 'quality', 'source_preference',
          'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options',
@@ -616,6 +616,30 @@ def __init__(self, params=None, auto_init=True):
                  '         If you experience any issues while using this option, '
                  f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report')
  
+        if self.params.get('bidi_workaround', False):
+            try:
+                import pty
+                master, slave = pty.openpty()
+                width = shutil.get_terminal_size().columns
+                width_args = [] if width is None else ['-w', str(width)]
+                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
+                try:
+                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
+                except OSError:
+                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+                self._output_channel = os.fdopen(master, 'rb')
+            except OSError as ose:
+                if ose.errno == errno.ENOENT:
+                    self.report_warning(
+                        'Could not find fribidi executable, ignoring --bidi-workaround. '
+                        'Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
+                else:
+                    raise
+
+        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
+        if auto_init and auto_init != 'no_verbose_header':
+            self.print_debug_header()
+
          def check_deprecated(param, option, suggestion):
              if self.params.get(param) is not None:
                  self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
@@ -635,7 +659,6 @@ def check_deprecated(param, option, suggestion):
          for msg in self.params.get('_deprecation_warnings', []):
              self.deprecated_feature(msg)
  
-        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
          if 'list-formats' in self.params['compat_opts']:
              self.params['listformats_table'] = False
  
@@ -649,6 +672,13 @@ def check_deprecated(param, option, suggestion):
          else:
              self.params['nooverwrites'] = not self.params['overwrites']
  
+        if self.params.get('simulate') is None and any((
+            self.params.get('list_thumbnails'),
+            self.params.get('listformats'),
+            self.params.get('listsubtitles'),
+        )):
+            self.params['simulate'] = 'list_only'
+
          self.params.setdefault('forceprint', {})
          self.params.setdefault('print_to_file', {})
  
@@ -656,29 +686,7 @@ def check_deprecated(param, option, suggestion):
          if not isinstance(params['forceprint'], dict):
              self.params['forceprint'] = {'video': params['forceprint']}
  
-        if self.params.get('bidi_workaround', False):
-            try:
-                import pty
-                master, slave = pty.openpty()
-                width = shutil.get_terminal_size().columns
-                width_args = [] if width is None else ['-w', str(width)]
-                sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error}
-                try:
-                    self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs)
-                except OSError:
-                    self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
-                self._output_channel = os.fdopen(master, 'rb')
-            except OSError as ose:
-                if ose.errno == errno.ENOENT:
-                    self.report_warning(
-                        'Could not find fribidi executable, ignoring --bidi-workaround. '
-                        'Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
-                else:
-                    raise
-
          if auto_init:
-            if auto_init != 'no_verbose_header':
-                self.print_debug_header()
              self.add_default_info_extractors()
  
          if (sys.platform != 'win32'
@@ -723,21 +731,23 @@ def check_deprecated(param, option, suggestion):
  
          def preload_download_archive(fn):
              """Preload the archive, if any is specified"""
+            archive = set()
              if fn is None:
-                return False
+                return archive
+            elif not is_path_like(fn):
+                return fn
+
              self.write_debug(f'Loading archive file {fn!r}')
              try:
                  with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                      for line in archive_file:
-                        self.archive.add(line.strip())
+                        archive.add(line.strip())
              except OSError as ioe:
                  if ioe.errno != errno.ENOENT:
                      raise
-                return False
-            return True
+            return archive
  
-        self.archive = set()
-        preload_download_archive(self.params.get('download_archive'))
+        self.archive = preload_download_archive(self.params.get('download_archive'))
  
      def warn_if_short_id(self, argv):
          # short YouTube ID starting with dash?
@@ -844,7 +854,7 @@ def to_stdout(self, message, skip_eol=False, quiet=None):
                                       'Use "YoutubeDL.to_screen" instead')
          self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out)
  
-    def to_screen(self, message, skip_eol=False, quiet=None):
+    def to_screen(self, message, skip_eol=False, quiet=None, only_once=False):
          """Print message to screen if not in quiet mode"""
          if self.params.get('logger'):
              self.params['logger'].debug(message)
@@ -853,7 +863,7 @@ def to_screen(self, message, skip_eol=False, quiet=None):
              return
          self._write_string(
              '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')),
-            self._out_files.screen)
+            self._out_files.screen, only_once=only_once)
  
      def to_stderr(self, message, only_once=False):
          """Print message to stderr"""
@@ -1245,9 +1255,11 @@ def create_key(outer_mobj):
                  delim = '\n' if '#' in flags else ', '
                  value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt
              elif fmt[-1] == 'j':  # json
-                value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt
+                value, fmt = json.dumps(
+                    value, default=_dumpjson_default,
+                    indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt
              elif fmt[-1] == 'h':  # html
-                value, fmt = escapeHTML(value), str_fmt
+                value, fmt = escapeHTML(str(value)), str_fmt
              elif fmt[-1] == 'q':  # quoted
                  value = map(str, variadic(value) if '#' in flags else [value])
                  value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
@@ -1346,10 +1358,18 @@ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
  
      def _match_entry(self, info_dict, incomplete=False, silent=False):
          """ Returns None if the file should be downloaded """
+        _type = info_dict.get('_type', 'video')
+        assert incomplete or _type == 'video', 'Only video result can be considered complete'
  
          video_title = info_dict.get('title', info_dict.get('id', 'entry'))
  
          def check_filter():
+            if _type in ('playlist', 'multi_video'):
+                return
+            elif _type in ('url', 'url_transparent') and not try_call(
+                    lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])):
+                return
+
              if 'title' in info_dict:
                  # This can happen when we're just evaluating the playlist
                  title = info_dict['title']
@@ -1419,18 +1439,19 @@ def add_extra_info(info_dict, extra_info):
      def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                       process=True, force_generic_extractor=False):
          """
-        Return a list with a dictionary for each video extracted.
+        Extract and return the information dictionary of the URL
  
          Arguments:
-        url -- URL to extract
+        @param url          URL to extract
  
          Keyword arguments:
-        download -- whether to download videos during extraction
-        ie_key -- extractor key hint
-        extra_info -- dictionary containing the extra values to add to each result
-        process -- whether to resolve all unresolved references (URLs, playlist items),
-            must be True for download to work.
-        force_generic_extractor -- force using the generic extractor
+        @param download     Whether to download videos
+        @param process      Whether to resolve all unresolved references (URLs, playlist items).
+                            Must be True for download to work
+        @param ie_key       Use only the extractor with this key
+
+        @param extra_info   Dictionary containing the extra values to add to the info (For internal use only)
+        @param force_generic_extractor  Force using the generic extractor (Deprecated; use ie_key='Generic')
          """
  
          if extra_info is None:
@@ -1616,6 +1637,7 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
                  self.add_default_extra_info(info_copy, ie, ie_result['url'])
                  self.add_extra_info(info_copy, extra_info)
                  info_copy, _ = self.pre_process(info_copy)
+                self._fill_common_fields(info_copy, False)
                  self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True)
                  self._raise_pending_errors(info_copy)
                  if self.params.get('force_write_download_archive', False):
@@ -1682,8 +1704,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
          elif result_type in ('playlist', 'multi_video'):
              # Protect from infinite recursion due to recursively nested playlists
              # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
-            webpage_url = ie_result['webpage_url']
-            if webpage_url in self._playlist_urls:
+            webpage_url = ie_result.get('webpage_url')  # Playlists may not have webpage_url
+            if webpage_url and webpage_url in self._playlist_urls:
                  self.to_screen(
                      '[download] Skipping already downloaded playlist: %s'
                      % ie_result.get('title') or ie_result.get('id'))
@@ -1737,14 +1759,17 @@ def _playlist_infodict(ie_result, strict=False, **kwargs):
          }
          if strict:
              return info
+        if ie_result.get('webpage_url'):
+            info.update({
+                'webpage_url': ie_result['webpage_url'],
+                'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                'webpage_url_domain': get_domain(ie_result['webpage_url']),
+            })
          return {
              **info,
              'playlist_index': 0,
              '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
              'extractor': ie_result['extractor'],
-            'webpage_url': ie_result['webpage_url'],
-            'webpage_url_basename': url_basename(ie_result['webpage_url']),
-            'webpage_url_domain': get_domain(ie_result['webpage_url']),
              'extractor_key': ie_result['extractor_key'],
          }
  
@@ -1799,7 +1824,7 @@ def __process_playlist(self, ie_result, download):
          elif self.params.get('playlistrandom'):
              random.shuffle(entries)
  
-        self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} videos'
+        self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items'
                         f'{format_field(ie_result, "playlist_count", " of %s")}')
  
          keep_resolved_entries = self.params.get('extract_flat') != 'discard'
@@ -1832,7 +1857,7 @@ def __process_playlist(self, ie_result, download):
                  resolved_entries[i] = (playlist_index, NO_DEFAULT)
                  continue
  
-            self.to_screen('[download] Downloading video %s of %s' % (
+            self.to_screen('[download] Downloading item %s of %s' % (
                  self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
  
              extra.update({
@@ -1850,8 +1875,11 @@ def __process_playlist(self, ie_result, download):
                  resolved_entries[i] = (playlist_index, entry_result)
  
          # Update with processed data
-        ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
          ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT]
+        ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
+        if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))):
+            # Do not set for full playlist
+            ie_result.pop('requested_entries')
  
          # Write the updated info to json
          if _infojson_written is True and self._write_info_json(
@@ -2371,10 +2399,9 @@ def check_thumbnails(thumbnails):
          else:
              info_dict['thumbnails'] = thumbnails
  
-    def _fill_common_fields(self, info_dict, is_video=True):
+    def _fill_common_fields(self, info_dict, final=True):
          # TODO: move sanitization here
-        if is_video:
-            # playlists are allowed to lack "title"
+        if final:
              title = info_dict.get('title', NO_DEFAULT)
              if title is NO_DEFAULT:
                  raise ExtractorError('Missing "title" field in extractor result',
@@ -2418,11 +2445,13 @@ def _fill_common_fields(self, info_dict, is_video=True):
              for key in live_keys:
                  if info_dict.get(key) is None:
                      info_dict[key] = (live_status == key)
+        if live_status == 'post_live':
+            info_dict['was_live'] = True
  
          # Auto generate title fields corresponding to the *_number fields when missing
          # in order to always have clean titles. This is very common for TV series.
          for field in ('chapter', 'season', 'episode'):
-            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+            if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                  info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
  
      def _raise_pending_errors(self, info):
@@ -2515,21 +2544,17 @@ def sanitize_numeric_fields(info):
          info_dict['requested_subtitles'] = self.process_subtitles(
              info_dict['id'], subtitles, automatic_captions)
  
-        if info_dict.get('formats') is None:
-            # There's only one format available
-            formats = [info_dict]
-        else:
-            formats = info_dict['formats']
+        formats = self._get_formats(info_dict)
  
          # or None ensures --clean-infojson removes it
          info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
          if not self.params.get('allow_unplayable_formats'):
              formats = [f for f in formats if not f.get('has_drm')]
-            if info_dict['_has_drm'] and formats and all(
-                    f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
-                self.report_warning(
-                    'This video is DRM protected and only images are available for download. '
-                    'Use --list-formats to see them')
+
+        if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
+            self.report_warning(
+                f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}'
+                'only images are available for download. Use --list-formats to see them'.capitalize())
  
          get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
          if not get_from_start:
@@ -2634,10 +2659,9 @@ def is_wellformed(f):
          info_dict, _ = self.pre_process(info_dict, 'after_filter')
  
          # The pre-processors may have modified the formats
-        formats = info_dict.get('formats', [info_dict])
+        formats = self._get_formats(info_dict)
  
-        list_only = self.params.get('simulate') is None and (
-            self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles'))
+        list_only = self.params.get('simulate') == 'list_only'
          interactive_format_selection = not list_only and self.format_selector == '-'
          if self.params.get('list_thumbnails'):
              self.list_thumbnails(info_dict)
@@ -2692,31 +2716,30 @@ def is_wellformed(f):
              # Process what we can, even without any available formats.
              formats_to_download = [{}]
  
-        requested_ranges = self.params.get('download_ranges')
-        if requested_ranges:
-            requested_ranges = tuple(requested_ranges(info_dict, self))
-
+        requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self))
          best_format, downloaded_formats = formats_to_download[-1], []
          if download:
-            if best_format:
+            if best_format and requested_ranges:
                  def to_screen(*msg):
                      self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}')
  
                  to_screen(f'Downloading {len(formats_to_download)} format(s):',
                            (f['format_id'] for f in formats_to_download))
-                if requested_ranges:
+                if requested_ranges != ({}, ):
                      to_screen(f'Downloading {len(requested_ranges)} time ranges:',
-                              (f'{int(c["start_time"])}-{int(c["end_time"])}' for c in requested_ranges))
+                              (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges))
              max_downloads_reached = False
  
-            for fmt, chapter in itertools.product(formats_to_download, requested_ranges or [{}]):
+            for fmt, chapter in itertools.product(formats_to_download, requested_ranges):
                  new_info = self._copy_infodict(info_dict)
                  new_info.update(fmt)
                  offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf')
+                end_time = offset + min(chapter.get('end_time', duration), duration)
                  if chapter or offset:
                      new_info.update({
                          'section_start': offset + chapter.get('start_time', 0),
-                        'section_end': offset + min(chapter.get('end_time', duration), duration),
+                        # duration may not be accurate. So allow deviations <1sec
+                        'section_end': end_time if end_time <= offset + duration + 1 else None,
                          'section_title': chapter.get('title'),
                          'section_number': chapter.get('index'),
                      })
@@ -2813,13 +2836,16 @@ def _forceprint(self, key, info_dict):
          info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
  
          def format_tmpl(tmpl):
-            mobj = re.fullmatch(r'([\w.:,-]|(?P<dict>{[\w.:,-]+}))+=', tmpl)
+            mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl)
              if not mobj:
                  return tmpl
-            elif not mobj.group('dict'):
-                return '\n'.join(f'{f} = %({f})r' for f in tmpl[:-1].split(','))
-            tmpl = f'.{tmpl[:-1]}' if tmpl.startswith('{') else tmpl[:-1]
-            return f'{tmpl} = %({tmpl})#j'
+
+            fmt = '%({})s'
+            if tmpl.startswith('{'):
+                tmpl = f'.{tmpl}'
+            if tmpl.endswith('='):
+                tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
+            return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))
  
          for tmpl in self.params['forceprint'].get(key, []):
              self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
@@ -3461,8 +3487,7 @@ def _make_archive_id(self, info_dict):
          return make_archive_id(extractor, video_id)
  
      def in_download_archive(self, info_dict):
-        fn = self.params.get('download_archive')
-        if fn is None:
+        if not self.archive:
              return False
  
          vid_ids = [self._make_archive_id(info_dict)]
@@ -3475,9 +3500,11 @@ def record_download_archive(self, info_dict):
              return
          vid_id = self._make_archive_id(info_dict)
          assert vid_id
+
          self.write_debug(f'Adding to archive: {vid_id}')
-        with locked_file(fn, 'a', encoding='utf-8') as archive_file:
-            archive_file.write(vid_id + '\n')
+        if is_path_like(fn):
+            with locked_file(fn, 'a', encoding='utf-8') as archive_file:
+                archive_file.write(vid_id + '\n')
          self.archive.add(vid_id)
  
      @staticmethod
@@ -3559,11 +3586,17 @@ def _format_note(self, fdict):
              res += '~' + format_bytes(fdict['filesize_approx'])
          return res
  
-    def render_formats_table(self, info_dict):
-        if not info_dict.get('formats') and not info_dict.get('url'):
-            return None
+    def _get_formats(self, info_dict):
+        if info_dict.get('formats') is None:
+            if info_dict.get('url') and info_dict.get('_type', 'video') == 'video':
+                return [info_dict]
+            return []
+        return info_dict['formats']
  
-        formats = info_dict.get('formats', [info_dict])
+    def render_formats_table(self, info_dict):
+        formats = self._get_formats(info_dict)
+        if not formats:
+            return
          if not self.params.get('listformats_table', True) is not False:
              table = [
                  [
@@ -3571,7 +3604,7 @@ def render_formats_table(self, info_dict):
                      format_field(f, 'ext'),
                      self.format_resolution(f),
                      self._format_note(f)
-                ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
+                ] for f in formats if (f.get('preference') or 0) >= -1000]
              return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1)
  
          def simplified_codec(f, field):
@@ -3630,7 +3663,7 @@ def render_thumbnails_table(self, info_dict):
              return None
          return render_table(
              self._list_format_headers('ID', 'Width', 'Height', 'URL'),
-            [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
+            [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails])
  
      def render_subtitles_table(self, video_id, subtitles):
          def _row(lang, formats):
@@ -3673,6 +3706,8 @@ def print_debug_header(self):
          if not self.params.get('verbose'):
              return
  
+        from . import _IN_CLI  # Must be delayed import
+
          # These imports can be slow. So import them only as needed
          from .extractor.extractors import _LAZY_LOADER
          from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors
@@ -3709,7 +3744,12 @@ def get_encoding(stream):
              __version__,
              f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
              '' if source == 'unknown' else f'({source})',
+            '' if _IN_CLI else 'API',
              delim=' '))
+
+        if not _IN_CLI:
+            write_debug(f'params: {self.params}')
+
          if not _LAZY_LOADER:
              if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
                  write_debug('Lazy loading extractors is forcibly disabled')