[extractor/iq] Increase phantomjs timeout

[yt-dlp.git] / yt_dlp / utils.py
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 26ef3c7dd1b9713ab4a32f4a48f86c5dab27a6bc..adb7c0e8c5d4686a4e0e5b3b9c669331e293ab8f 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -5,6 +5,7 @@
  import calendar
  import codecs
  import collections
+import collections.abc
  import contextlib
  import datetime
  import email.header
@@ -231,7 +232,7 @@ def random_user_agent():
  ])
  
  PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
-JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
  
  NUMBER_RE = r'\d+(?:\.\d+)?'
  
@@ -407,7 +408,7 @@ def get_elements_html_by_attribute(*args, **kwargs):
      return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
  
  
-def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
+def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
      """
      Return the text (content) and the html (whole) of the tag with the specified
      attribute in the passed HTML document
@@ -418,7 +419,7 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value
      value = re.escape(value) if escape_value else value
  
      partial_element_re = rf'''(?x)
-        <(?P<tag>[a-zA-Z0-9:._-]+)
+        <(?P<tag>{tag})
           (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
           \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
          '''
@@ -890,8 +891,9 @@ def kill(self, *, timeout=0):
      @classmethod
      def run(cls, *args, timeout=None, **kwargs):
          with cls(*args, **kwargs) as proc:
+            default = '' if proc.text_mode else b''
              stdout, stderr = proc.communicate_or_kill(timeout=timeout)
-            return stdout or '', stderr or '', proc.returncode
+            return stdout or default, stderr or default, proc.returncode
  
  
  def get_subprocess_encoding():
@@ -2572,7 +2574,9 @@ def strftime_or_none(timestamp, date_format, default=None):
      datetime_object = None
      try:
          if isinstance(timestamp, (int, float)):  # unix timestamp
-            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
+            # Using naive datetime here can break timestamp() in Windows
+            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
+            datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
          elif isinstance(timestamp, str):  # assume YYYYMMDD
              datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
          date_format = re.sub(  # Support %s on windows
@@ -3178,6 +3182,10 @@ def multipart_encode(data, boundary=None):
      return out, content_type
  
  
+def variadic(x, allowed_types=(str, bytes, dict)):
+    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
+
+
  def dict_get(d, key_or_keys, default=None, skip_false_values=True):
      for val in map(d.get, variadic(key_or_keys)):
          if val is not None and (val or not skip_false_values):
@@ -3189,7 +3197,7 @@ def try_call(*funcs, expected_type=None, args=[], kwargs={}):
      for f in funcs:
          try:
              val = f(*args, **kwargs)
-        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
+        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
              pass
          else:
              if expected_type is None or isinstance(val, expected_type):
@@ -3267,6 +3275,8 @@ def strip_jsonp(code):
  
  def js_to_json(code, vars={}, *, strict=False):
      # vars is a dict of var, val pairs to substitute
+    STRING_QUOTES = '\'"'
+    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
      COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
      SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
      INTEGER_TABLE = (
@@ -3274,6 +3284,15 @@ def js_to_json(code, vars={}, *, strict=False):
          (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
      )
  
+    def process_escape(match):
+        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
+        escape = match.group(1) or match.group(2)
+
+        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
+                else R'\u00' if escape == 'x'
+                else '' if escape == '\n'
+                else escape)
+
      def fix_kv(m):
          v = m.group(0)
          if v in ('true', 'false', 'null'):
@@ -3281,28 +3300,25 @@ def fix_kv(m):
          elif v in ('undefined', 'void 0'):
              return 'null'
          elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
-            return ""
-
-        if v[0] in ("'", '"'):
-            v = re.sub(r'(?s)\\.|"', lambda m: {
-                '"': '\\"',
-                "\\'": "'",
-                '\\\n': '',
-                '\\x': '\\u00',
-            }.get(m.group(0), m.group(0)), v[1:-1])
-        else:
-            for regex, base in INTEGER_TABLE:
-                im = re.match(regex, v)
-                if im:
-                    i = int(im.group(1), base)
-                    return '"%d":' % i if v.endswith(':') else '%d' % i
+            return ''
+
+        if v[0] in STRING_QUOTES:
+            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
+            return f'"{escaped}"'
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(1), base)
+                return f'"{i}":' if v.endswith(':') else str(i)
  
-            if v in vars:
-                return vars[v]
-            if strict:
-                raise ValueError(f'Unknown value: {v}')
+        if v in vars:
+            return json.dumps(vars[v])
  
-        return '"%s"' % v
+        if not strict:
+            return f'"{v}"'
+
+        raise ValueError(f'Unknown value: {v}')
  
      def create_map(mobj):
          return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
@@ -3310,16 +3326,16 @@ def create_map(mobj):
      code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
      if not strict:
          code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
  
-    return re.sub(r'''(?sx)
-        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
-        {comment}|,(?={skip}[\]}}])|
+    return re.sub(rf'''(?sx)
+        {STRING_RE}|
+        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
          void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
-        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
-        [0-9]+(?={skip}:)|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
+        [0-9]+(?={SKIP_RE}:)|
          !+
-        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+        ''', fix_kv, code)
  
  
  def qualities(quality_ids):
@@ -3543,7 +3559,7 @@ def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
      COMPATIBLE_CODECS = {
          'mp4': {
              'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
-            'h264', 'aacl',  # Set in ISM
+            'h264', 'aacl', 'ec-3',  # Set in ISM
          },
          'webm': {
              'av1', 'vp9', 'vp8', 'opus', 'vrbs',
@@ -3790,6 +3806,9 @@ def __init__(self, chapters, ranges):
          self.chapters, self.ranges = chapters, ranges
  
      def __call__(self, info_dict, ydl):
+        if not self.ranges and not self.chapters:
+            yield {}
+
          warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                     else 'Cannot match chapters since chapter information is unavailable')
          for regex in self.chapters or []:
@@ -5284,108 +5303,172 @@ def load_plugins(name, suffix, namespace):
  
  
  def traverse_obj(
-        obj, *path_list, default=None, expected_type=None, get_all=True,
+        obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
          casesense=True, is_user_input=False, traverse_string=False):
-    ''' Traverse nested list/dict/tuple
-    @param path_list        A list of paths which are checked one by one.
-                            Each path is a list of keys where each key is a:
-                              - None:     Do nothing
-                              - string:   A dictionary key / regex group
-                              - int:      An index into a list
-                              - tuple:    A list of keys all of which will be traversed
-                              - Ellipsis: Fetch all values in the object
-                              - Function: Takes the key and value as arguments
-                                          and returns whether the key matches or not
-    @param default          Default value to return
-    @param expected_type    Only accept final value of this type (Can also be any callable)
-    @param get_all          Return all the values obtained from a path or only the first one
-    @param casesense        Whether to consider dictionary keys as case sensitive
-
-    The following are only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API
-
-    @param path_list        In addition to the above,
-                              - dict:     Given {k:v, ...}; return {k: traverse_obj(obj, v), ...}
-    @param is_user_input    Whether the keys are generated from user input. If True,
-                            strings are converted to int/slice if necessary
-    @param traverse_string  Whether to traverse inside strings. If True, any
-                            non-compatible object will also be converted into a string
-    '''  # TODO: Write tests
-    if not casesense:
-        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
-        path_list = (map(_lower, variadic(path)) for path in path_list)
-
-    def _traverse_obj(obj, path, _current_depth=0):
-        nonlocal depth
-        path = tuple(variadic(path))
-        for i, key in enumerate(path):
-            if None in (key, obj):
-                return obj
-            if isinstance(key, (list, tuple)):
-                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
-                key = ...
-
-            if key is ...:
-                obj = (obj.values() if isinstance(obj, dict)
-                       else obj if isinstance(obj, (list, tuple, LazyList))
-                       else str(obj) if traverse_string else [])
-                _current_depth += 1
-                depth = max(depth, _current_depth)
-                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
-            elif isinstance(key, dict):
-                obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()})
-            elif callable(key):
-                if isinstance(obj, (list, tuple, LazyList)):
-                    obj = enumerate(obj)
-                elif isinstance(obj, dict):
-                    obj = obj.items()
-                else:
-                    if not traverse_string:
-                        return None
-                    obj = str(obj)
-                _current_depth += 1
-                depth = max(depth, _current_depth)
-                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
-            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
-                obj = (obj.get(key) if casesense or (key in obj)
-                       else next((v for k, v in obj.items() if _lower(k) == key), None))
-            else:
-                if is_user_input:
-                    key = (int_or_none(key) if ':' not in key
-                           else slice(*map(int_or_none, key.split(':'))))
-                    if key == slice(None):
-                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
-                if not isinstance(key, (int, slice)):
-                    return None
-                if not isinstance(obj, (list, tuple, LazyList)):
-                    if not traverse_string:
-                        return None
-                    obj = str(obj)
-                try:
-                    obj = obj[key]
-                except IndexError:
-                    return None
-        return obj
+    """
+    Safely traverse nested `dict`s and `Sequence`s
+
+    >>> obj = [{}, {"key": "value"}]
+    >>> traverse_obj(obj, (1, "key"))
+    "value"
+
+    Each of the provided `paths` is tested and the first producing a valid result will be returned.
+    The next path will also be tested if the path branched but no results could be found.
+    Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
+    A value of None is treated as the absence of a value.
+
+    The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
+
+    The keys in the path can be one of:
+        - `None`:           Return the current object.
+        - `str`/`int`:      Return `obj[key]`. For `re.Match, return `obj.group(key)`.
+        - `slice`:          Branch out and return all values in `obj[key]`.
+        - `Ellipsis`:       Branch out and return a list of all values.
+        - `tuple`/`list`:   Branch out and return a list of all matching values.
+                            Read as: `[traverse_obj(obj, branch) for branch in branches]`.
+        - `function`:       Branch out and return values filtered by the function.
+                            Read as: `[value for key, value in obj if function(key, value)]`.
+                            For `Sequence`s, `key` is the index of the value.
+        - `dict`            Transform the current object and return a matching dict.
+                            Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
+
+        `tuple`, `list`, and `dict` all support nested paths and branches.
+
+    @params paths           Paths which to traverse by.
+    @param default          Value to return if the paths do not match.
+    @param expected_type    If a `type`, only accept final values of this type.
+                            If any other callable, try to call the function on each result.
+    @param get_all          If `False`, return the first matching result, otherwise all matching ones.
+    @param casesense        If `False`, consider string dictionary keys as case insensitive.
+
+    The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
+
+    @param is_user_input    Whether the keys are generated from user input.
+                            If `True` strings get converted to `int`/`slice` if needed.
+    @param traverse_string  Whether to traverse into objects as strings.
+                            If `True`, any non-compatible object will first be
+                            converted into a string and then traversed into.
+
+
+    @returns                The result of the object traversal.
+                            If successful, `get_all=True`, and the path branches at least once,
+                            then a list of results is returned instead.
+                            A list is always returned if the last path branches and no `default` is given.
+    """
+    is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
+    casefold = lambda k: k.casefold() if isinstance(k, str) else k
  
      if isinstance(expected_type, type):
          type_test = lambda val: val if isinstance(val, expected_type) else None
      else:
-        type_test = expected_type or IDENTITY
-
-    for path in path_list:
-        depth = 0
-        val = _traverse_obj(obj, path)
-        if val is not None:
-            if depth:
-                for _ in range(depth - 1):
-                    val = itertools.chain.from_iterable(v for v in val if v is not None)
-                val = [v for v in map(type_test, val) if v is not None]
-                if val:
-                    return val if get_all else val[0]
+        type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
+
+    def apply_key(key, obj):
+        if obj is None:
+            return
+
+        elif key is None:
+            yield obj
+
+        elif isinstance(key, (list, tuple)):
+            for branch in key:
+                _, result = apply_path(obj, branch)
+                yield from result
+
+        elif key is ...:
+            if isinstance(obj, collections.abc.Mapping):
+                yield from obj.values()
+            elif is_sequence(obj):
+                yield from obj
+            elif isinstance(obj, re.Match):
+                yield from obj.groups()
+            elif traverse_string:
+                yield from str(obj)
+
+        elif callable(key):
+            if is_sequence(obj):
+                iter_obj = enumerate(obj)
+            elif isinstance(obj, collections.abc.Mapping):
+                iter_obj = obj.items()
+            elif isinstance(obj, re.Match):
+                iter_obj = enumerate((obj.group(), *obj.groups()))
+            elif traverse_string:
+                iter_obj = enumerate(str(obj))
              else:
-                val = type_test(val)
-                if val is not None:
-                    return val
-    return default
+                return
+            yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
+
+        elif isinstance(key, dict):
+            iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
+            yield {k: v if v is not None else default for k, v in iter_obj
+                   if v is not None or default is not NO_DEFAULT}
+
+        elif isinstance(obj, collections.abc.Mapping):
+            yield (obj.get(key) if casesense or (key in obj)
+                   else next((v for k, v in obj.items() if casefold(k) == key), None))
+
+        elif isinstance(obj, re.Match):
+            if isinstance(key, int) or casesense:
+                with contextlib.suppress(IndexError):
+                    yield obj.group(key)
+                    return
+
+            if not isinstance(key, str):
+                return
+
+            yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
+
+        else:
+            if is_user_input:
+                key = (int_or_none(key) if ':' not in key
+                       else slice(*map(int_or_none, key.split(':'))))
+
+            if not isinstance(key, (int, slice)):
+                return
+
+            if not is_sequence(obj):
+                if not traverse_string:
+                    return
+                obj = str(obj)
+
+            with contextlib.suppress(IndexError):
+                yield obj[key]
+
+    def apply_path(start_obj, path):
+        objs = (start_obj,)
+        has_branched = False
+
+        for key in variadic(path):
+            if is_user_input and key == ':':
+                key = ...
+
+            if not casesense and isinstance(key, str):
+                key = key.casefold()
+
+            if key is ... or isinstance(key, (list, tuple)) or callable(key):
+                has_branched = True
+
+            key_func = functools.partial(apply_key, key)
+            objs = itertools.chain.from_iterable(map(key_func, objs))
+
+        return has_branched, objs
+
+    def _traverse_obj(obj, path, use_list=True):
+        has_branched, results = apply_path(obj, path)
+        results = LazyList(x for x in map(type_test, results) if x is not None)
+
+        if get_all and has_branched:
+            return results.exhaust() if results or use_list else None
+
+        return results[0] if results else None
+
+    for index, path in enumerate(paths, 1):
+        use_list = default is NO_DEFAULT and index == len(paths)
+        result = _traverse_obj(obj, path, use_list)
+        if result is not None:
+            return result
+
+    return None if default is NO_DEFAULT else default
  
  
  def traverse_dict(dictn, keys, casesense=True):
@@ -5398,10 +5481,6 @@ def get_first(obj, keys, **kwargs):
      return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
  
  
-def variadic(x, allowed_types=(str, bytes, dict)):
-    return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
-
-
  def time_seconds(**kwargs):
      t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
      return t.timestamp()
@@ -5429,7 +5508,8 @@ def jwt_encode_hs256(payload_data, key, headers={}):
  # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
  def jwt_decode_hs256(jwt):
      header_b64, payload_b64, signature_b64 = jwt.split('.')
-    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
+    # add trailing ='s that may have been stripped, superfluous ='s are ignored
+    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
      return payload_data