Expand `--check-formats` to thumbnails

[yt-dlp.git] / yt_dlp / YoutubeDL.py
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py

index 0edbb4119fb393132a7f2b6191d79f9fbfd17261..b1bc05a80e7e4e18ed9d35cd30da69d4e1ddae59 100644 (file)
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -20,6 +20,7 @@
  import shutil
  import subprocess
  import sys
+import tempfile
  import time
  import tokenize
  import traceback
@@ -67,6 +68,7 @@
      STR_FORMAT_RE,
      formatSeconds,
      GeoRestrictedError,
+    HEADRequest,
      int_or_none,
      iri_to_uri,
      ISO3166Utils,
@@ -86,7 +88,6 @@
      preferredencoding,
      prepend_extension,
      process_communicate_or_kill,
-    random_uuidv4,
      register_socks_protocols,
      RejectedVideoReached,
      render_table,
@@ -472,8 +473,7 @@ def __init__(self, params=None, auto_init=True):
  
          if sys.version_info < (3, 6):
              self.report_warning(
-                'Support for Python version %d.%d have been deprecated and will break in future versions of yt-dlp! '
-                'Update to Python 3.6 or above' % sys.version_info[:2])
+                'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2])
  
          def check_deprecated(param, option, suggestion):
              if self.params.get(param) is not None:
@@ -539,6 +539,11 @@ def check_deprecated(param, option, suggestion):
  
          self.outtmpl_dict = self.parse_outtmpl()
  
+        # Creating format selector here allows us to catch syntax errors before the extraction
+        self.format_selector = (
+            None if self.params.get('format') is None
+            else self.build_format_selector(self.params['format']))
+
          self._setup_opener()
  
          """Preload the archive, if any is specified"""
@@ -813,6 +818,21 @@ def parse_outtmpl(self):
                      'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
          return outtmpl_dict
  
+    def get_output_path(self, dir_type='', filename=None):
+        paths = self.params.get('paths', {})
+        assert isinstance(paths, dict)
+        path = os.path.join(
+            expand_path(paths.get('home', '').strip()),
+            expand_path(paths.get(dir_type, '').strip()) if dir_type else '',
+            filename or '')
+
+        # Temporary fix for #4787
+        # 'Treat' all problem characters by passing filename through preferredencoding
+        # to work around encoding issues with subprocess on python2 @ Windows
+        if sys.version_info < (3, 0) and sys.platform == 'win32':
+            path = encodeFilename(path, True).decode(preferredencoding())
+        return sanitize_path(path, force=self.params.get('windowsfilenames'))
+
      @staticmethod
      def validate_outtmpl(tmpl):
          ''' @return None or Exception object '''
@@ -990,12 +1010,11 @@ def _prepare_filename(self, info_dict, tmpl_type='default'):
  
      def prepare_filename(self, info_dict, dir_type='', warn=False):
          """Generate the output filename."""
-        paths = self.params.get('paths', {})
-        assert isinstance(paths, dict)
+
          filename = self._prepare_filename(info_dict, dir_type or 'default')
  
          if warn and not self.__prepare_filename_warned:
-            if not paths:
+            if not self.params.get('paths'):
                  pass
              elif filename == '-':
                  self.report_warning('--paths is ignored when an outputting to stdout')
@@ -1005,18 +1024,7 @@ def prepare_filename(self, info_dict, dir_type='', warn=False):
          if filename == '-' or not filename:
              return filename
  
-        homepath = expand_path(paths.get('home', '').strip())
-        assert isinstance(homepath, compat_str)
-        subdir = expand_path(paths.get(dir_type, '').strip()) if dir_type else ''
-        assert isinstance(subdir, compat_str)
-        path = os.path.join(homepath, subdir, filename)
-
-        # Temporary fix for #4787
-        # 'Treat' all problem characters by passing filename through preferredencoding
-        # to workaround encoding issues with subprocess on python2 @ Windows
-        if sys.version_info < (3, 0) and sys.platform == 'win32':
-            path = encodeFilename(path, True).decode(preferredencoding())
-        return sanitize_path(path, force=self.params.get('windowsfilenames'))
+        return self.get_output_path(dir_type, filename)
  
      def _match_entry(self, info_dict, incomplete=False, silent=False):
          """ Returns None if the file should be downloaded """
@@ -1488,12 +1496,11 @@ def _build_format_filter(self, filter_spec):
              '!=': operator.ne,
          }
          operator_rex = re.compile(r'''(?x)\s*
-            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
-            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
-            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
-            $
+            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)\s*
+            (?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)\s*
              ''' % '|'.join(map(re.escape, OPERATORS.keys())))
-        m = operator_rex.search(filter_spec)
+        m = operator_rex.fullmatch(filter_spec)
          if m:
              try:
                  comparison_value = int(m.group('value'))
@@ -1514,13 +1521,12 @@ def _build_format_filter(self, filter_spec):
                  '$=': lambda attr, value: attr.endswith(value),
                  '*=': lambda attr, value: value in attr,
              }
-            str_operator_rex = re.compile(r'''(?x)
-                \s*(?P<key>[a-zA-Z0-9._-]+)
-                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
-                \s*(?P<value>[a-zA-Z0-9._-]+)
-                \s*$
+            str_operator_rex = re.compile(r'''(?x)\s*
+                (?P<key>[a-zA-Z0-9._-]+)\s*
+                (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+                (?P<value>[a-zA-Z0-9._-]+)\s*
                  ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
-            m = str_operator_rex.search(filter_spec)
+            m = str_operator_rex.fullmatch(filter_spec)
              if m:
                  comparison_value = m.group('value')
                  str_op = STR_OPERATORS[m.group('op')]
@@ -1530,7 +1536,7 @@ def _build_format_filter(self, filter_spec):
                      op = str_op
  
          if not m:
-            raise ValueError('Invalid filter specification %r' % filter_spec)
+            raise SyntaxError('Invalid filter specification %r' % filter_spec)
  
          def _filter(f):
              actual_value = f.get(m.group('key'))
@@ -1740,18 +1746,20 @@ def _merge(formats_pair):
          def _check_formats(formats):
              for f in formats:
                  self.to_screen('[info] Testing format %s' % f['format_id'])
-                paths = self.params.get('paths', {})
-                temp_file = os.path.join(
-                    expand_path(paths.get('home', '').strip()),
-                    expand_path(paths.get('temp', '').strip()),
-                    'ytdl.%s.f%s.check-format' % (random_uuidv4(), f['format_id']))
+                temp_file = tempfile.NamedTemporaryFile(
+                    suffix='.tmp', delete=False,
+                    dir=self.get_output_path('temp') or None)
+                temp_file.close()
                  try:
-                    dl, _ = self.dl(temp_file, f, test=True)
+                    dl, _ = self.dl(temp_file.name, f, test=True)
                  except (ExtractorError, IOError, OSError, ValueError) + network_exceptions:
                      dl = False
                  finally:
-                    if os.path.exists(temp_file):
-                        os.remove(temp_file)
+                    if os.path.exists(temp_file.name):
+                        try:
+                            os.remove(temp_file.name)
+                        except OSError:
+                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
                  if dl:
                      yield f
                  else:
@@ -1914,8 +1922,7 @@ def _calc_cookies(self, info_dict):
          self.cookiejar.add_cookie_header(pr)
          return pr.get_header('Cookie')
  
-    @staticmethod
-    def _sanitize_thumbnails(info_dict):
+    def _sanitize_thumbnails(self, info_dict):
          thumbnails = info_dict.get('thumbnails')
          if thumbnails is None:
              thumbnail = info_dict.get('thumbnail')
@@ -1928,12 +1935,25 @@ def _sanitize_thumbnails(info_dict):
                  t.get('height') if t.get('height') is not None else -1,
                  t.get('id') if t.get('id') is not None else '',
                  t.get('url')))
+
+            def test_thumbnail(t):
+                self.to_screen('[info] Testing thumbnail %s' % t['id'])
+                try:
+                    self.urlopen(HEADRequest(t['url']))
+                except network_exceptions as err:
+                    self.to_screen('[info] Unable to connect to thumbnail %s URL "%s" - %s. Skipping...' % (
+                        t['id'], t['url'], error_to_compat_str(err)))
+                    return False
+                return True
+
              for i, t in enumerate(thumbnails):
-                t['url'] = sanitize_url(t['url'])
-                if t.get('width') and t.get('height'):
-                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
                  if t.get('id') is None:
                      t['id'] = '%d' % i
+                if t.get('width') and t.get('height'):
+                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
+                t['url'] = sanitize_url(t['url'])
+            if self.params.get('check_formats'):
+                info_dict['thumbnails'] = reversed(LazyList(filter(test_thumbnail, thumbnails[::-1])))
  
      def process_video_result(self, info_dict, download=True):
          assert info_dict.get('_type', 'video') == 'video'
@@ -2119,12 +2139,11 @@ def is_wellformed(f):
              self.list_formats(info_dict)
              return
  
-        req_format = self.params.get('format')
-        if req_format is None:
+        format_selector = self.format_selector
+        if format_selector is None:
              req_format = self._default_format_spec(info_dict, download=download)
              self.write_debug('Default format spec: %s' % req_format)
-
-        format_selector = self.build_format_selector(req_format)
+            format_selector = self.build_format_selector(req_format)
  
          # While in format selection we may need to have an access to the original
          # format set in order to calculate some metrics or do some processing.
@@ -2798,7 +2817,7 @@ def filter_requested_info(info_dict, actually_filter=True):
              info_dict['epoch'] = int(time.time())
              reject = lambda k, v: k in remove_keys
          filter_fn = lambda obj: (
-            list(map(filter_fn, obj)) if isinstance(obj, (list, tuple, set))
+            list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set))
              else obj if not isinstance(obj, dict)
              else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)))
          return filter_fn(info_dict)
@@ -3036,7 +3055,7 @@ def list_formats(self, info_dict):
                  hideEmpty=new_format)))
  
      def list_thumbnails(self, info_dict):
-        thumbnails = info_dict.get('thumbnails')
+        thumbnails = list(info_dict.get('thumbnails'))
          if not thumbnails:
              self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
              return
@@ -3246,6 +3265,7 @@ def _write_thumbnails(self, info_dict, filename):  # return the extensions
  
              if not self.params.get('overwrites', True) and os.path.exists(encodeFilename(thumb_filename)):
                  ret.append(suffix + thumb_ext)
+                t['filepath'] = thumb_filename
                  self.to_screen('[%s] %s: Thumbnail %sis already present' %
                                 (info_dict['extractor'], info_dict['id'], thumb_display_id))
              else: