Add option `--use-extractors`

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 7a2b03cb50e37bf20a085f2598529329e4c70a06..a3d5620425ee62df6430f483b2d669f0f8695142 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -29,6 +29,7 @@
  from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
  from .downloader.rtmp import rtmpdump_version
  from .extractor import gen_extractor_classes, get_info_extractor
+from .extractor.common import UnsupportedURLIE
  from .extractor.openload import PhantomJSwrapper
  from .minicurses import format_text
  from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
@@ -115,6 +116,7 @@
      network_exceptions,
      number_of_digits,
      orderedSet,
+    orderedSet_from_options,
      parse_filesize,
      preferredencoding,
      prepend_extension,
@@ -236,7 +238,7 @@ class YoutubeDL:
                         Default is 'only_download' for CLI, but False for API
      skip_playlist_after_errors: Number of allowed failures until the rest of
                         the playlist is skipped
-    force_generic_extractor: Force downloader to use the generic extractor
+    allowed_extractors:  List of regexes to match against extractor names that are allowed
      overwrites:        Overwrite all video and metadata files if True,
                         overwrite only non-video files if None
                         and don't overwrite any file if False
@@ -444,6 +446,7 @@ class YoutubeDL:
                         * index: Section number (Optional)
      force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
      noprogress:        Do not print the progress bar
+    live_from_start:   Whether to download livestream videos from the start
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the downloader (see yt_dlp/downloader/common.py):
@@ -475,6 +478,8 @@ class YoutubeDL:
  
      The following options are deprecated and may be removed in the future:
  
+    force_generic_extractor: Force downloader to use the generic extractor
+                       - Use allowed_extractors = ['generic', 'default']
      playliststart:     - Use playlist_items
                         Playlist item to start at.
      playlistend:       - Use playlist_items
@@ -756,13 +761,6 @@ def add_info_extractor(self, ie):
              self._ies_instances[ie_key] = ie
              ie.set_downloader(self)
  
-    def _get_info_extractor_class(self, ie_key):
-        ie = self._ies.get(ie_key)
-        if ie is None:
-            ie = get_info_extractor(ie_key)
-            self.add_info_extractor(ie)
-        return ie
-
      def get_info_extractor(self, ie_key):
          """
          Get an instance of an IE with name ie_key, it will try to get one from
@@ -779,8 +777,19 @@ def add_default_info_extractors(self):
          """
          Add the InfoExtractors returned by gen_extractors to the end of the list
          """
-        for ie in gen_extractor_classes():
-            self.add_info_extractor(ie)
+        all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
+        all_ies['end'] = UnsupportedURLIE()
+        try:
+            ie_names = orderedSet_from_options(
+                self.params.get('allowed_extractors', ['default']), {
+                    'all': list(all_ies),
+                    'default': [name for name, ie in all_ies.items() if ie._ENABLED],
+                }, use_regex=True)
+        except re.error as e:
+            raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
+        for name in ie_names:
+            self.add_info_extractor(all_ies[name])
+        self.write_debug(f'Loaded {len(ie_names)} extractors')
  
      def add_post_processor(self, pp, when='post_process'):
          """Add a PostProcessor object to the end of the chain."""
@@ -1411,11 +1420,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
              ie_key = 'Generic'
  
          if ie_key:
-            ies = {ie_key: self._get_info_extractor_class(ie_key)}
+            ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
          else:
              ies = self._ies
  
-        for ie_key, ie in ies.items():
+        for key, ie in ies.items():
              if not ie.suitable(url):
                  continue
  
@@ -1424,14 +1433,16 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
                                      'and will probably not work.')
  
              temp_id = ie.get_temp_id(url)
-            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
-                self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
+            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
+                self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
                  if self.params.get('break_on_existing', False):
                      raise ExistingVideoReached()
                  break
-            return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
+            return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
          else:
-            self.report_error('no suitable InfoExtractor for URL %s' % url)
+            extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
+            self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
+                              tb=False if extractors_restricted else None)
  
      def _handle_extraction_exceptions(func):
          @functools.wraps(func)
@@ -2736,27 +2747,11 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
          if self.params.get('allsubtitles', False):
              requested_langs = all_sub_langs
          elif self.params.get('subtitleslangs', False):
-            # A list is used so that the order of languages will be the same as
-            # given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
-            requested_langs = []
-            for lang_re in self.params.get('subtitleslangs'):
-                discard = lang_re[0] == '-'
-                if discard:
-                    lang_re = lang_re[1:]
-                if lang_re == 'all':
-                    if discard:
-                        requested_langs = []
-                    else:
-                        requested_langs.extend(all_sub_langs)
-                    continue
-                current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
-                if discard:
-                    for lang in current_langs:
-                        while lang in requested_langs:
-                            requested_langs.remove(lang)
-                else:
-                    requested_langs.extend(current_langs)
-            requested_langs = orderedSet(requested_langs)
+            try:
+                requested_langs = orderedSet_from_options(
+                    self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
+            except re.error as e:
+                raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}')
          elif normal_sub_langs:
              requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
          else:
@@ -3443,7 +3438,7 @@ def in_download_archive(self, info_dict):
              return False
  
          vid_ids = [self._make_archive_id(info_dict)]
-        vid_ids.extend(info_dict.get('_old_archive_ids', []))
+        vid_ids.extend(info_dict.get('_old_archive_ids') or [])
          return any(id_ in self.archive for id_ in vid_ids)
  
      def record_download_archive(self, info_dict):