Add option `--break-match-filters`

[yt-dlp.git] / yt_dlp / utils.py
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 15e1f97cbfad72b6e2bf001388bfd6a27f703217..e9b8894473f39f228145d11b490778b12ec884ee 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -593,21 +593,43 @@ def clean_html(html):
  
  
  class LenientJSONDecoder(json.JSONDecoder):
-    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
+    # TODO: Write tests
+    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
          self.transform_source, self.ignore_extra = transform_source, ignore_extra
+        self._close_attempts = 2 * close_objects
          super().__init__(*args, **kwargs)
  
+    @staticmethod
+    def _close_object(err):
+        doc = err.doc[:err.pos]
+        # We need to add comma first to get the correct error message
+        if err.msg.startswith('Expecting \',\''):
+            return doc + ','
+        elif not doc.endswith(','):
+            return
+
+        if err.msg.startswith('Expecting property name'):
+            return doc[:-1] + '}'
+        elif err.msg.startswith('Expecting value'):
+            return doc[:-1] + ']'
+
      def decode(self, s):
          if self.transform_source:
              s = self.transform_source(s)
-        try:
-            if self.ignore_extra:
-                return self.raw_decode(s.lstrip())[0]
-            return super().decode(s)
-        except json.JSONDecodeError as e:
-            if e.pos is not None:
+        for attempt in range(self._close_attempts + 1):
+            try:
+                if self.ignore_extra:
+                    return self.raw_decode(s.lstrip())[0]
+                return super().decode(s)
+            except json.JSONDecodeError as e:
+                if e.pos is None:
+                    raise
+                elif attempt < self._close_attempts:
+                    s = self._close_object(e)
+                    if s is not None:
+                        continue
                  raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
-            raise
+        assert False, 'Too many attempts to decode JSON'
  
  
  def sanitize_open(filename, open_mode):
@@ -879,6 +901,7 @@ def __init__(self, *args, env=None, text=False, **kwargs):
              env = os.environ.copy()
          self._fix_pyinstaller_ld_path(env)
  
+        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
          if text is True:
              kwargs['universal_newlines'] = True  # For 3.6 compatibility
              kwargs.setdefault('encoding', 'utf-8')
@@ -900,7 +923,7 @@ def kill(self, *, timeout=0):
      @classmethod
      def run(cls, *args, timeout=None, **kwargs):
          with cls(*args, **kwargs) as proc:
-            default = '' if proc.text_mode else b''
+            default = '' if proc.__text_mode else b''
              stdout, stderr = proc.communicate_or_kill(timeout=timeout)
              return stdout or default, stderr or default, proc.returncode
  
@@ -1207,8 +1230,8 @@ class ExistingVideoReached(DownloadCancelled):
  
  
  class RejectedVideoReached(DownloadCancelled):
-    """ --break-on-reject triggered """
-    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
+    """ --break-match-filters triggered """
+    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
  
  
  class MaxDownloadsReached(DownloadCancelled):
@@ -1438,19 +1461,16 @@ def http_response(self, req, resp):
                      raise original_ioerror
              resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
-            del resp.headers['Content-encoding']
          # deflate
          if resp.headers.get('Content-encoding', '') == 'deflate':
              gz = io.BytesIO(self.deflate(resp.read()))
              resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
-            del resp.headers['Content-encoding']
          # brotli
          if resp.headers.get('Content-encoding', '') == 'br':
              resp = urllib.request.addinfourl(
                  io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
-            del resp.headers['Content-encoding']
          # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
          # https://github.com/ytdl-org/youtube-dl/issues/6457).
          if 300 <= resp.code < 400:
@@ -2106,7 +2126,7 @@ class OVERLAPPED(ctypes.Structure):
              ('hEvent', ctypes.wintypes.HANDLE),
          ]
  
-    kernel32 = ctypes.windll.kernel32
+    kernel32 = ctypes.WinDLL('kernel32')
      LockFileEx = kernel32.LockFileEx
      LockFileEx.argtypes = [
          ctypes.wintypes.HANDLE,     # hFile
@@ -3022,8 +3042,10 @@ def get_requested_items(self):
                  if not entry:
                      continue
                  try:
-                    # TODO: Add auto-generated fields
-                    self.ydl._match_entry(entry, incomplete=True, silent=True)
+                    # The item may have just been added to archive. Don't break due to it
+                    if not self.ydl.params.get('lazy_playlist'):
+                        # TODO: Add auto-generated fields
+                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                  except (ExistingVideoReached, RejectedVideoReached):
                      return
  
@@ -3152,14 +3174,28 @@ def urlencode_postdata(*args, **kargs):
      return urllib.parse.urlencode(*args, **kargs).encode('ascii')
  
  
+def update_url(url, *, query_update=None, **kwargs):
+    """Replace URL components specified by kwargs
+       @param url           str or parsed URL tuple (as returned by urllib.parse.urlparse)
+       @param query_update  update query
+       @returns             str
+    """
+    if isinstance(url, str):
+        if not kwargs and not query_update:
+            return url
+        else:
+            url = urllib.parse.urlparse(url)
+    if query_update:
+        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
+        kwargs['query'] = urllib.parse.urlencode({
+            **urllib.parse.parse_qs(url.query),
+            **query_update
+        }, True)
+    return urllib.parse.urlunparse(url._replace(**kwargs))
+
+
  def update_url_query(url, query):
-    if not query:
-        return url
-    parsed_url = urllib.parse.urlparse(url)
-    qs = urllib.parse.parse_qs(parsed_url.query)
-    qs.update(query)
-    return urllib.parse.urlunparse(parsed_url._replace(
-        query=urllib.parse.urlencode(qs, True)))
+    return update_url(url, query_update=query)
  
  
  def update_Request(req, url=None, data=None, headers=None, query=None):
@@ -3385,6 +3421,8 @@ def create_map(mobj):
      if not strict:
          code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
          code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
+        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
+        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
  
      return re.sub(rf'''(?sx)
          {STRING_RE}|
@@ -3651,7 +3689,8 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
          },
      }
  
-    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
+    sanitize_codec = functools.partial(
+        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
      vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
  
      for ext in preferences or COMPATIBLE_CODECS.keys():
@@ -3872,16 +3911,21 @@ def match_str(filter_str, dct, incomplete=False):
          for filter_part in re.split(r'(?<!\\)&', filter_str))
  
  
-def match_filter_func(filters):
-    if not filters:
+def match_filter_func(filters, breaking_filters=None):
+    if not filters and not breaking_filters:
          return None
-    filters = set(variadic(filters))
+    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
+    filters = set(variadic(filters or []))
  
      interactive = '-' in filters
      if interactive:
          filters.remove('-')
  
      def _match_func(info_dict, incomplete=False):
+        ret = breaking_filters(info_dict, incomplete)
+        if ret is not None:
+            raise RejectedVideoReached(ret)
+
          if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
              return NO_DEFAULT if interactive and not incomplete else None
          else:
@@ -3916,7 +3960,7 @@ def __eq__(self, other):
                  and self.chapters == other.chapters and self.ranges == other.ranges)
  
      def __repr__(self):
-        return f'{type(self).__name__}({self.chapters}, {self.ranges})'
+        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
  
  
  def parse_dfxp_time_expr(time_expr):
@@ -5371,8 +5415,8 @@ def random_uuidv4():
  def make_dir(path, to_screen=None):
      try:
          dn = os.path.dirname(path)
-        if dn and not os.path.exists(dn):
-            os.makedirs(dn)
+        if dn:
+            os.makedirs(dn, exist_ok=True)
          return True
      except OSError as err:
          if callable(to_screen) is not None:
@@ -5418,12 +5462,15 @@ def traverse_obj(
      Each of the provided `paths` is tested and the first producing a valid result will be returned.
      The next path will also be tested if the path branched but no results could be found.
      Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
-    A value of None is treated as the absence of a value.
+    Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
  
      The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
  
      The keys in the path can be one of:
          - `None`:           Return the current object.
+        - `set`:            Requires the only item in the set to be a type or function,
+                            like `{type}`/`{func}`. If a `type`, returns only values
+                            of this type. If a function, returns `func(obj)`.
          - `str`/`int`:      Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
          - `slice`:          Branch out and return all values in `obj[key]`.
          - `Ellipsis`:       Branch out and return a list of all values.
@@ -5432,6 +5479,8 @@ def traverse_obj(
          - `function`:       Branch out and return values filtered by the function.
                              Read as: `[value for key, value in obj if function(key, value)]`.
                              For `Sequence`s, `key` is the index of the value.
+                            For `re.Match`es, `key` is first each group number (0 = full match)
+                            and then each group name, if the pattern has named groups.
          - `dict`            Transform the current object and return a matching dict.
                              Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
  
@@ -5439,8 +5488,12 @@ def traverse_obj(
  
      @params paths           Paths which to traverse by.
      @param default          Value to return if the paths do not match.
+                            If the last key in the path is a `dict`, it will apply to each value inside
+                            the dict instead, depth first. Try to avoid if using nested `dict` keys.
      @param expected_type    If a `type`, only accept final values of this type.
                              If any other callable, try to call the function on each result.
+                            If the last key in the path is a `dict`, it will apply to each value inside
+                            the dict instead, recursively. This does respect branching paths.
      @param get_all          If `False`, return the first matching result, otherwise all matching ones.
      @param casesense        If `False`, consider string dictionary keys as case insensitive.
  
@@ -5451,12 +5504,15 @@ def traverse_obj(
      @param traverse_string  Whether to traverse into objects as strings.
                              If `True`, any non-compatible object will first be
                              converted into a string and then traversed into.
+                            The return value of that path will be a string instead,
+                            not respecting any further branching.
  
  
      @returns                The result of the object traversal.
                              If successful, `get_all=True`, and the path branches at least once,
                              then a list of results is returned instead.
-                            A list is always returned if the last path branches and no `default` is given.
+                            If no `default` is given and the last path branches, a `list` of results
+                            is always returned. If a path ends on a `dict` that result will always be a `dict`.
      """
      is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
      casefold = lambda k: k.casefold() if isinstance(k, str) else k
@@ -5466,108 +5522,154 @@ def traverse_obj(
      else:
          type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
  
-    def apply_key(key, obj):
-        if obj is None:
-            return
+    def apply_key(key, obj, is_last):
+        branching = False
+        result = None
+
+        if obj is None and traverse_string:
+            pass
  
          elif key is None:
-            yield obj
+            result = obj
+
+        elif isinstance(key, set):
+            assert len(key) == 1, 'Set should only be used to wrap a single item'
+            item = next(iter(key))
+            if isinstance(item, type):
+                if isinstance(obj, item):
+                    result = obj
+            else:
+                result = try_call(item, args=(obj,))
  
          elif isinstance(key, (list, tuple)):
-            for branch in key:
-                _, result = apply_path(obj, branch)
-                yield from result
+            branching = True
+            result = itertools.chain.from_iterable(
+                apply_path(obj, branch, is_last)[0] for branch in key)
  
          elif key is ...:
+            branching = True
              if isinstance(obj, collections.abc.Mapping):
-                yield from obj.values()
+                result = obj.values()
              elif is_sequence(obj):
-                yield from obj
+                result = obj
              elif isinstance(obj, re.Match):
-                yield from obj.groups()
+                result = obj.groups()
              elif traverse_string:
-                yield from str(obj)
+                branching = False
+                result = str(obj)
+            else:
+                result = ()
  
          elif callable(key):
-            if is_sequence(obj):
-                iter_obj = enumerate(obj)
-            elif isinstance(obj, collections.abc.Mapping):
+            branching = True
+            if isinstance(obj, collections.abc.Mapping):
                  iter_obj = obj.items()
+            elif is_sequence(obj):
+                iter_obj = enumerate(obj)
              elif isinstance(obj, re.Match):
-                iter_obj = enumerate((obj.group(), *obj.groups()))
+                iter_obj = itertools.chain(
+                    enumerate((obj.group(), *obj.groups())),
+                    obj.groupdict().items())
              elif traverse_string:
+                branching = False
                  iter_obj = enumerate(str(obj))
              else:
-                return
-            yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
+                iter_obj = ()
+
+            result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
+            if not branching:  # string traversal
+                result = ''.join(result)
  
          elif isinstance(key, dict):
-            iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
-            yield {k: v if v is not None else default for k, v in iter_obj
-                   if v is not None or default is not NO_DEFAULT}
+            iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
+            result = {
+                k: v if v is not None else default for k, v in iter_obj
+                if v is not None or default is not NO_DEFAULT
+            } or None
  
          elif isinstance(obj, collections.abc.Mapping):
-            yield (obj.get(key) if casesense or (key in obj)
-                   else next((v for k, v in obj.items() if casefold(k) == key), None))
+            result = (obj.get(key) if casesense or (key in obj) else
+                      next((v for k, v in obj.items() if casefold(k) == key), None))
  
          elif isinstance(obj, re.Match):
              if isinstance(key, int) or casesense:
                  with contextlib.suppress(IndexError):
-                    yield obj.group(key)
-                    return
+                    result = obj.group(key)
  
-            if not isinstance(key, str):
-                return
+            elif isinstance(key, str):
+                result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
  
-            yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
+        elif isinstance(key, (int, slice)):
+            if is_sequence(obj):
+                branching = isinstance(key, slice)
+                with contextlib.suppress(IndexError):
+                    result = obj[key]
+            elif traverse_string:
+                with contextlib.suppress(IndexError):
+                    result = str(obj)[key]
  
-        else:
-            if is_user_input:
-                key = (int_or_none(key) if ':' not in key
-                       else slice(*map(int_or_none, key.split(':'))))
+        return branching, result if branching else (result,)
  
-            if not isinstance(key, (int, slice)):
-                return
+    def lazy_last(iterable):
+        iterator = iter(iterable)
+        prev = next(iterator, NO_DEFAULT)
+        if prev is NO_DEFAULT:
+            return
  
-            if not is_sequence(obj):
-                if not traverse_string:
-                    return
-                obj = str(obj)
+        for item in iterator:
+            yield False, prev
+            prev = item
  
-            with contextlib.suppress(IndexError):
-                yield obj[key]
+        yield True, prev
  
-    def apply_path(start_obj, path):
+    def apply_path(start_obj, path, test_type):
          objs = (start_obj,)
          has_branched = False
  
-        for key in variadic(path):
-            if is_user_input and key == ':':
-                key = ...
+        key = None
+        for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
+            if is_user_input and isinstance(key, str):
+                if key == ':':
+                    key = ...
+                elif ':' in key:
+                    key = slice(*map(int_or_none, key.split(':')))
+                elif int_or_none(key) is not None:
+                    key = int(key)
  
              if not casesense and isinstance(key, str):
                  key = key.casefold()
  
-            if key is ... or isinstance(key, (list, tuple)) or callable(key):
-                has_branched = True
+            if __debug__ and callable(key):
+                # Verify function signature
+                inspect.signature(key).bind(None, None)
+
+            new_objs = []
+            for obj in objs:
+                branching, results = apply_key(key, obj, last)
+                has_branched |= branching
+                new_objs.append(results)
  
-            key_func = functools.partial(apply_key, key)
-            objs = itertools.chain.from_iterable(map(key_func, objs))
+            objs = itertools.chain.from_iterable(new_objs)
  
-        return has_branched, objs
+        if test_type and not isinstance(key, (dict, list, tuple)):
+            objs = map(type_test, objs)
  
-    def _traverse_obj(obj, path, use_list=True):
-        has_branched, results = apply_path(obj, path)
-        results = LazyList(x for x in map(type_test, results) if x is not None)
+        return objs, has_branched, isinstance(key, dict)
  
+    def _traverse_obj(obj, path, allow_empty, test_type):
+        results, has_branched, is_dict = apply_path(obj, path, test_type)
+        results = LazyList(item for item in results if item not in (None, {}))
          if get_all and has_branched:
-            return results.exhaust() if results or use_list else None
+            if results:
+                return results.exhaust()
+            if allow_empty:
+                return [] if default is NO_DEFAULT else default
+            return None
  
-        return results[0] if results else None
+        return results[0] if results else {} if allow_empty and is_dict else None
  
      for index, path in enumerate(paths, 1):
-        use_list = default is NO_DEFAULT and index == len(paths)
-        result = _traverse_obj(obj, path, use_list)
+        result = _traverse_obj(obj, path, index == len(paths), True)
          if result is not None:
              return result
  
@@ -5585,8 +5687,10 @@ def get_first(obj, keys, **kwargs):
  
  
  def time_seconds(**kwargs):
-    t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
-    return t.timestamp()
+    """
+    Returns time in seconds since the epoch (1970-01-01T00:00:00Z), shifted by the UTC offset given as `datetime.timedelta` kwargs
+    """
+    return time.time() + datetime.timedelta(**kwargs).total_seconds()
  
  
  # create a JSON Web Signature (jws) with HS256 algorithm
@@ -5959,6 +6063,20 @@ def __get__(self, _, cls):
          return self._cache[cls]
  
  
+class function_with_repr:
+    def __init__(self, func, repr_=None):
+        functools.update_wrapper(self, func)
+        self.func, self.__repr = func, repr_
+
+    def __call__(self, *args, **kwargs):
+        return self.func(*args, **kwargs)
+
+    def __repr__(self):
+        if self.__repr:
+            return self.__repr
+        return f'{self.func.__module__}.{self.func.__qualname__}'
+
+
  class Namespace(types.SimpleNamespace):
      """Immutable namespace"""