[cleanup] Misc

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 0f8a51dbe3b899e1e4656ee6c89b07e63894cb12..c2b306d70c30cba90aa920a8beae1ed6fafa71fb 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -52,6 +52,7 @@
      DEFAULT_OUTTMPL,
      IDENTITY,
      LINK_TEMPLATES,
+    MEDIA_EXTENSIONS,
      NO_DEFAULT,
      NUMBER_RE,
      OUTTMPL_TYPES,
@@ -80,6 +81,7 @@
      RejectedVideoReached,
      SameFileError,
      UnavailableVideoError,
+    UserNotLive,
      YoutubeDLCookieProcessor,
      YoutubeDLHandler,
      YoutubeDLRedirectHandler,
@@ -100,11 +102,13 @@
      format_decimal_suffix,
      format_field,
      formatSeconds,
+    get_compatible_ext,
      get_domain,
      int_or_none,
      iri_to_uri,
      join_nonempty,
      locked_file,
+    make_archive_id,
      make_dir,
      make_HTTPS_handler,
      merge_headers,
@@ -131,6 +135,7 @@
      timetuple_from_msec,
      to_high_limit_path,
      traverse_obj,
+    try_call,
      try_get,
      url_basename,
      variadic,
@@ -139,7 +144,7 @@
      write_json_file,
      write_string,
  )
-from .version import RELEASE_GIT_HEAD, __version__
+from .version import RELEASE_GIT_HEAD, VARIANT, __version__
  
  if compat_os_name == 'nt':
      import ctypes
@@ -267,7 +272,7 @@ class YoutubeDL:
      subtitleslangs:    List of languages of the subtitles to download (can be regex).
                         The list may contain "all" to refer to all the available
                         subtitles. The language can be prefixed with a "-" to
-                       exclude it from the requested languages. Eg: ['all', '-live_chat']
+                       exclude it from the requested languages, e.g. ['all', '-live_chat']
      keepvideo:         Keep the video file after post-processing
      daterange:         A DateRange object, download only if the upload_date is in the range.
      skip_download:     Skip the actual download of the video file
@@ -296,8 +301,8 @@ class YoutubeDL:
                         should act on each input URL as opposed to for the entire queue
      cookiefile:        File name or text stream from where cookies should be read and dumped to
      cookiesfrombrowser:  A tuple containing the name of the browser, the profile
-                       name/pathfrom where cookies are loaded, and the name of the
-                       keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
+                       name/path from where cookies are loaded, and the name of the
+                       keyring, e.g. ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
      legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
                         support RFC 5746 secure renegotiation
      nocheckcertificate:  Do not verify SSL certificates
@@ -306,7 +311,7 @@ class YoutubeDL:
      client_certificate_password:  Password for client certificate private key, if encrypted.
                          If not provided and the key is encrypted, yt-dlp will ask interactively
      prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
-                       At the moment, this is only supported by YouTube.
+                       (Only supported by some extractors)
      http_headers:      A dictionary of custom headers to be used for all requests
      proxy:             URL of the proxy server to use
      geo_verification_proxy:  URL of the proxy to use for IP address verification
@@ -369,7 +374,7 @@ class YoutubeDL:
  
                         Progress hooks are guaranteed to be called at least twice
                         (with status "started" and "finished") if the processing is successful.
-    merge_output_format: Extension to use when merging formats.
+    merge_output_format: "/" separated list of extensions to use when merging formats.
      final_ext:         Expected final extension; used to detect when the file was
                         already downloaded and converted
      fixup:             Automatically correct known faults of the file.
@@ -439,6 +444,7 @@ class YoutubeDL:
                         * index: Section number (Optional)
      force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts
      noprogress:        Do not print the progress bar
+    live_from_start:   Whether to download livestream videos from the start
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the downloader (see yt_dlp/downloader/common.py):
@@ -465,7 +471,7 @@ class YoutubeDL:
                         discontinuities such as ad breaks (default: False)
      extractor_args:    A dictionary of arguments to be passed to the extractors.
                         See "EXTRACTOR ARGUMENTS" for details.
-                       Eg: {'youtube': {'skip': ['dash', 'hls']}}
+                       E.g. {'youtube': {'skip': ['dash', 'hls']}}
      mark_watched:      Mark videos watched (even with --simulate). Only for YouTube
  
      The following options are deprecated and may be removed in the future:
@@ -522,7 +528,8 @@ class YoutubeDL:
      """
  
      _NUMERIC_FIELDS = {
-        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+        'width', 'height', 'asr', 'audio_channels', 'fps',
+        'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx',
          'timestamp', 'release_timestamp',
          'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
          'average_rating', 'comment_count', 'age_limit',
@@ -534,7 +541,7 @@ class YoutubeDL:
      _format_fields = {
          # NB: Keep in sync with the docstring of extractor/common.py
          'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note',
-        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr',
+        'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels',
          'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx',
          'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start',
          'preference', 'language', 'language_preference', 'quality', 'source_preference',
@@ -542,9 +549,9 @@ class YoutubeDL:
          'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time'
      }
      _format_selection_exts = {
-        'audio': {'m4a', 'mp3', 'ogg', 'aac'},
-        'video': {'mp4', 'flv', 'webm', '3gp'},
-        'storyboards': {'mhtml'},
+        'audio': set(MEDIA_EXTENSIONS.common_audio),
+        'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )),
+        'storyboards': set(MEDIA_EXTENSIONS.storyboards),
      }
  
      def __init__(self, params=None, auto_init=True):
@@ -584,12 +591,13 @@ def __init__(self, params=None, auto_init=True):
              for type_, stream in self._out_files.items_ if type_ != 'console'
          })
  
-        MIN_SUPPORTED, MIN_RECOMMENDED = (3, 6), (3, 7)
+        # The code is left like this to be reused for future deprecations
+        MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7)
          current_version = sys.version_info[:2]
          if current_version < MIN_RECOMMENDED:
              msg = ('Support for Python version %d.%d has been deprecated. '
-                   'See  https://github.com/yt-dlp/yt-dlp/issues/3764  for more details. '
-                   'You will recieve only one more update on this version')
+                   'See  https://github.com/yt-dlp/yt-dlp/issues/3764  for more details.'
+                   '\n                    You will no longer receive updates on this version')
              if current_version < MIN_SUPPORTED:
                  msg = 'Python version %d.%d is no longer supported'
              self.deprecation_warning(
@@ -1039,7 +1047,7 @@ def _outtmpl_expandpath(outtmpl):
  
          # outtmpl should be expand_path'ed before template dict substitution
          # because meta fields may contain env variables we don't want to
-        # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+        # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and
          # title "Hello $PATH", we don't want `$PATH` to be expanded.
          return expand_path(outtmpl).replace(sep, '')
  
@@ -1159,6 +1167,9 @@ def get_value(mdict):
              if mdict['strf_format']:
                  value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ','))
  
+            # XXX: Workaround for https://github.com/yt-dlp/yt-dlp/issues/4485
+            if sanitize and value == '':
+                value = None
              return value
  
          na = self.params.get('outtmpl_na_placeholder', 'NA')
@@ -1308,7 +1319,7 @@ def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False):
      def _match_entry(self, info_dict, incomplete=False, silent=False):
          """ Returns None if the file should be downloaded """
  
-        video_title = info_dict.get('title', info_dict.get('id', 'video'))
+        video_title = info_dict.get('title', info_dict.get('id', 'entry'))
  
          def check_filter():
              if 'title' in info_dict:
@@ -1455,7 +1466,7 @@ def wrapper(self, *args, **kwargs):
                  break
          return wrapper
  
-    def _wait_for_video(self, ie_result):
+    def _wait_for_video(self, ie_result={}):
          if (not self.params.get('wait_for_video')
                  or ie_result.get('_type', 'video') != 'video'
                  or ie_result.get('formats') or ie_result.get('url')):
@@ -1479,7 +1490,7 @@ def progress(msg):
          if diff is None and ie_result.get('live_status') == 'is_upcoming':
              diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0)
              self.report_warning('Release time of video is not known')
-        elif (diff or 0) <= 0:
+        elif ie_result and (diff or 0) <= 0:
              self.report_warning('Video should already be available according to extracted info')
          diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf'))
          self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now')
@@ -1503,7 +1514,14 @@ def progress(msg):
  
      @_handle_extraction_exceptions
      def __extract_info(self, url, ie, download, extra_info, process):
-        ie_result = ie.extract(url)
+        try:
+            ie_result = ie.extract(url)
+        except UserNotLive as e:
+            if process:
+                if self.params.get('wait_for_video'):
+                    self.report_warning(e)
+                self._wait_for_video()
+            raise
          if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
              self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}')
              return
@@ -1553,7 +1571,8 @@ def process_ie_result(self, ie_result, download=True, extra_info=None):
          result_type = ie_result.get('_type', 'video')
  
          if result_type in ('url', 'url_transparent'):
-            ie_result['url'] = sanitize_url(ie_result['url'])
+            ie_result['url'] = sanitize_url(
+                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
              if ie_result.get('original_url'):
                  extra_info.setdefault('original_url', ie_result['original_url'])
  
@@ -1676,24 +1695,38 @@ def _ensure_dir_exists(self, path):
          return make_dir(path, self.report_error)
  
      @staticmethod
-    def _playlist_infodict(ie_result, **kwargs):
-        return {
-            **ie_result,
+    def _playlist_infodict(ie_result, strict=False, **kwargs):
+        info = {
+            'playlist_count': ie_result.get('playlist_count'),
              'playlist': ie_result.get('title') or ie_result.get('id'),
              'playlist_id': ie_result.get('id'),
              'playlist_title': ie_result.get('title'),
              'playlist_uploader': ie_result.get('uploader'),
              'playlist_uploader_id': ie_result.get('uploader_id'),
-            'playlist_index': 0,
              **kwargs,
          }
+        if strict:
+            return info
+        return {
+            **info,
+            'playlist_index': 0,
+            '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
+            'extractor': ie_result['extractor'],
+            'webpage_url': ie_result['webpage_url'],
+            'webpage_url_basename': url_basename(ie_result['webpage_url']),
+            'webpage_url_domain': get_domain(ie_result['webpage_url']),
+            'extractor_key': ie_result['extractor_key'],
+        }
  
      def __process_playlist(self, ie_result, download):
          """Process each entry in the playlist"""
          assert ie_result['_type'] in ('playlist', 'multi_video')
  
-        title = ie_result.get('title') or ie_result.get('id') or '<Untitled>'
-        self.to_screen(f'[download] Downloading playlist: {title}')
+        common_info = self._playlist_infodict(ie_result, strict=True)
+        title = common_info.get('playlist') or '<Untitled>'
+        if self._match_entry(common_info, incomplete=True) is not None:
+            return
+        self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
  
          all_entries = PlaylistEntries(self, ie_result)
          entries = orderedSet(all_entries.get_requested_items(), lazy=True)
@@ -1710,12 +1743,14 @@ def __process_playlist(self, ie_result, download):
              # Better to do this after potentially exhausting entries
              ie_result['playlist_count'] = all_entries.get_full_count()
  
+        extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
+        ie_copy = collections.ChainMap(ie_result, extra)
+
          _infojson_written = False
          write_playlist_files = self.params.get('allow_playlist_files', True)
          if write_playlist_files and self.params.get('list_thumbnails'):
              self.list_thumbnails(ie_result)
          if write_playlist_files and not self.params.get('simulate'):
-            ie_copy = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries))
              _infojson_written = self._write_info_json(
                  'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
              if _infojson_written is None:
@@ -1724,7 +1759,7 @@ def __process_playlist(self, ie_result, download):
                                         self.prepare_filename(ie_copy, 'pl_description')) is None:
                  return
              # TODO: This should be passed to ThumbnailsConvertor if necessary
-            self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail'))
+            self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail'))
  
          if lazy:
              if self.params.get('playlistreverse') or self.params.get('playlistrandom'):
@@ -1748,35 +1783,33 @@ def __process_playlist(self, ie_result, download):
          for i, (playlist_index, entry) in enumerate(entries):
              if lazy:
                  resolved_entries.append((playlist_index, entry))
-
-            # TODO: Add auto-generated fields
-            if not entry or self._match_entry(entry, incomplete=True) is not None:
+            if not entry:
                  continue
  
-            self.to_screen('[download] Downloading video %s of %s' % (
-                self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
-
              entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
              if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
                  playlist_index = ie_result['requested_entries'][i]
  
-            entry_result = self.__process_iterable_entry(entry, download, {
+            entry_copy = collections.ChainMap(entry, {
+                **common_info,
                  'n_entries': int_or_none(n_entries),
-                '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)),
-                'playlist_count': ie_result.get('playlist_count'),
                  'playlist_index': playlist_index,
                  'playlist_autonumber': i + 1,
-                'playlist': title,
-                'playlist_id': ie_result.get('id'),
-                'playlist_title': ie_result.get('title'),
-                'playlist_uploader': ie_result.get('uploader'),
-                'playlist_uploader_id': ie_result.get('uploader_id'),
-                'extractor': ie_result['extractor'],
-                'webpage_url': ie_result['webpage_url'],
-                'webpage_url_basename': url_basename(ie_result['webpage_url']),
-                'webpage_url_domain': get_domain(ie_result['webpage_url']),
-                'extractor_key': ie_result['extractor_key'],
              })
+
+            if self._match_entry(entry_copy, incomplete=True) is not None:
+                # For compatibility with youtube-dl. See https://github.com/yt-dlp/yt-dlp/issues/4369
+                resolved_entries[i] = (playlist_index, NO_DEFAULT)
+                continue
+
+            self.to_screen('[download] Downloading video %s of %s' % (
+                self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS)))
+
+            extra.update({
+                'playlist_index': playlist_index,
+                'playlist_autonumber': i + 1,
+            })
+            entry_result = self.__process_iterable_entry(entry, download, extra)
              if not entry_result:
                  failures += 1
              if failures >= max_failures:
@@ -1787,7 +1820,8 @@ def __process_playlist(self, ie_result, download):
                  resolved_entries[i] = (playlist_index, entry_result)
  
          # Update with processed data
-        ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], [])
+        ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT]
+        ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT]
  
          # Write the updated info to json
          if _infojson_written is True and self._write_info_json(
@@ -1944,8 +1978,8 @@ def _parse_filter(tokens):
                      filter_parts.append(string)
  
          def _remove_unused_ops(tokens):
-            # Remove operators that we don't use and join them with the surrounding strings
-            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
+            # Remove operators that we don't use and join them with the surrounding strings.
+            # E.g. 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
              ALLOWED_OPS = ('/', '+', ',', '(', ')')
              last_string, last_start, last_end, last_line = None, None, None, None
              for type, string, start, end, line in tokens:
@@ -2061,14 +2095,13 @@ def _merge(formats_pair):
              the_only_video = video_fmts[0] if len(video_fmts) == 1 else None
              the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None
  
-            output_ext = self.params.get('merge_output_format')
-            if not output_ext:
-                if the_only_video:
-                    output_ext = the_only_video['ext']
-                elif the_only_audio and not video_fmts:
-                    output_ext = the_only_audio['ext']
-                else:
-                    output_ext = 'mkv'
+            output_ext = get_compatible_ext(
+                vcodecs=[f.get('vcodec') for f in video_fmts],
+                acodecs=[f.get('acodec') for f in audio_fmts],
+                vexts=[f['ext'] for f in video_fmts],
+                aexts=[f['ext'] for f in audio_fmts],
+                preferences=(try_call(lambda: self.params['merge_output_format'].split('/'))
+                             or self.params.get('prefer_free_formats') and ('webm', 'mkv')))
  
              filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info))
  
@@ -2101,6 +2134,7 @@ def _merge(formats_pair):
                      'acodec': the_only_audio.get('acodec'),
                      'abr': the_only_audio.get('abr'),
                      'asr': the_only_audio.get('asr'),
+                    'audio_channels': the_only_audio.get('audio_channels')
                  })
  
              return new_dict
@@ -2461,7 +2495,7 @@ def sanitize_numeric_fields(info):
          info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None
          if not self.params.get('allow_unplayable_formats'):
              formats = [f for f in formats if not f.get('has_drm')]
-            if info_dict['_has_drm'] and all(
+            if info_dict['_has_drm'] and formats and all(
                      f.get('acodec') == f.get('vcodec') == 'none' for f in formats):
                  self.report_warning(
                      'This video is DRM protected and only images are available for download. '
@@ -3040,33 +3074,9 @@ def existing_video_file(*filepaths):
                          return
  
                  if info_dict.get('requested_formats') is not None:
-
-                    def compatible_formats(formats):
-                        # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them.
-                        video_formats = [format for format in formats if format.get('vcodec') != 'none']
-                        audio_formats = [format for format in formats if format.get('acodec') != 'none']
-                        if len(video_formats) > 2 or len(audio_formats) > 2:
-                            return False
-
-                        # Check extension
-                        exts = {format.get('ext') for format in formats}
-                        COMPATIBLE_EXTS = (
-                            {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'},
-                            {'webm'},
-                        )
-                        for ext_sets in COMPATIBLE_EXTS:
-                            if ext_sets.issuperset(exts):
-                                return True
-                        # TODO: Check acodec/vcodec
-                        return False
-
                      requested_formats = info_dict['requested_formats']
                      old_ext = info_dict['ext']
                      if self.params.get('merge_output_format') is None:
-                        if not compatible_formats(requested_formats):
-                            info_dict['ext'] = 'mkv'
-                            self.report_warning(
-                                'Requested formats are incompatible for merge and will be merged into mkv')
                          if (info_dict['ext'] == 'webm'
                                  and info_dict.get('thumbnails')
                                  # check with type instead of pp_key, __name__, or isinstance
@@ -3426,18 +3436,16 @@ def _make_archive_id(self, info_dict):
                      break
              else:
                  return
-        return f'{extractor.lower()} {video_id}'
+        return make_archive_id(extractor, video_id)
  
      def in_download_archive(self, info_dict):
          fn = self.params.get('download_archive')
          if fn is None:
              return False
  
-        vid_id = self._make_archive_id(info_dict)
-        if not vid_id:
-            return False  # Incomplete video information
-
-        return vid_id in self.archive
+        vid_ids = [self._make_archive_id(info_dict)]
+        vid_ids.extend(info_dict.get('_old_archive_ids') or [])
+        return any(id_ in self.archive for id_ in vid_ids)
  
      def record_download_archive(self, info_dict):
          fn = self.params.get('download_archive')
@@ -3567,6 +3575,7 @@ def simplified_codec(f, field):
                  format_field(f, func=self.format_resolution, ignore=('audio only', 'images')),
                  format_field(f, 'fps', '\t%d', func=round),
                  format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''),
+                format_field(f, 'audio_channels', '\t%s'),
                  delim,
                  format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes),
                  format_field(f, 'tbr', '\t%dk', func=round),
@@ -3586,7 +3595,7 @@ def simplified_codec(f, field):
                      delim=' '),
              ] for f in formats if f.get('preference') is None or f['preference'] >= -1000]
          header_line = self._list_format_headers(
-            'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO',
+            'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO',
              delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO')
  
          return render_table(
@@ -3671,6 +3680,8 @@ def get_encoding(stream):
              write_debug = lambda msg: self._write_string(f'[debug] {msg}\n')
  
          source = detect_variant()
+        if VARIANT not in (None, 'pip'):
+            source += '*'
          write_debug(join_nonempty(
              'yt-dlp version', __version__,
              f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',