X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/1ac7f461845b3f9c0c3a2e6a1308bf82d3e8e55a..54a63e80af82791d2f0985bd0176bb182963fd5f:/yt_dlp/jsinterp.py diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 2bb4acf3e..5c82de19e 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -9,6 +9,7 @@ from .utils import ( NO_DEFAULT, ExtractorError, + function_with_repr, js_to_json, remove_quotes, truncate_string, @@ -19,7 +20,12 @@ def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + if x in (None, JS_Undefined): + return 0 + with contextlib.suppress(TypeError): + if math.isnan(x): # NB: NaN cannot be checked by membership + return 0 + return x def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff @@ -38,7 +44,7 @@ def wrapped(a, b): def _js_div(a, b): - if JS_Undefined in (a, b) or not (a and b): + if JS_Undefined in (a, b) or not (a or b): return float('nan') return (a or 0) / b if b else float('inf') @@ -117,8 +123,8 @@ def _js_ternary(cndn, if_true=True, if_false=False): '-': _js_arith_op(operator.sub), '*': _js_arith_op(operator.mul), - '/': _js_div, '%': _js_mod, + '/': _js_div, '**': _js_exp, } @@ -184,7 +190,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion) raise if cls.ENABLED and stmt.strip(): - cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) + if should_ret or repr(ret) != stmt: + cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) return ret, should_ret return interpret_statement @@ -205,13 +212,11 @@ class JSInterpreter: 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string } - _EXC_NAME = '__yt_dlp_exception__' - def __init__(self, code, objects=None): self.code, self._functions = code, {} self._objects = {} if objects is None else objects - class Exception(ExtractorError): + class Exception(ExtractorError): # noqa: A001 def __init__(self, msg, expr=None, *args, **kwargs): if expr is not None: msg = f'{msg.rstrip()} in: {truncate_string(expr, 50, 50)}' @@ -220,6 +225,8 @@ def __init__(self, msg, expr=None, *args, **kwargs): def _named_object(self, namespace, obj): self.__named_object_counter += 1 name = f'__yt_dlp_jsinterp_obj{self.__named_object_counter}' + if callable(obj) and not isinstance(obj, function_with_repr): + obj = function_with_repr(obj, f'F<{self.__named_object_counter}>') namespace[name] = obj return name @@ -228,7 +235,7 @@ def _regex_flags(cls, expr): flags = 0 if not expr: return flags, expr - for idx, ch in enumerate(expr): + for idx, ch in enumerate(expr): # noqa: B007 if ch not in cls._RE_FLAGS: break flags |= cls._RE_FLAGS[ch] @@ -236,7 +243,7 @@ def _regex_flags(cls, expr): @staticmethod def _separate(expr, delim=',', max_split=None): - OP_CHARS = '+-*/%&|^=<>!,;{}:' + OP_CHARS = '+-*/%&|^=<>!,;{}:[' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} @@ -246,7 +253,9 @@ def _separate(expr, delim=',', max_split=None): if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 elif not in_quote and char in counters: - counters[char] -= 1 + # Something's wrong if we get negative, but ignore it anyway + if counters[char]: + counters[char] -= 1 elif not escaping: if char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': @@ -254,9 +263,11 @@ def _separate(expr, delim=',', max_split=None): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op) + in_unary_op = (not in_quote and not in_regex_char_group + and after_op not in (True, False) and char in '-+') + after_op = char if (not in_quote and char in OP_CHARS) else (char.isspace() and after_op) - if char != delim[pos] or any(counters.values()) or in_quote: + if char != delim[pos] or any(counters.values()) or in_quote or in_unary_op: pos = 0 continue elif pos != delim_len: @@ -341,7 +352,10 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': flags, outer = self._regex_flags(outer) - inner = re.compile(inner[1:], flags=flags) + # We don't support regex methods yet, so no point compiling it + inner = f'{inner}/{flags}' + # Avoid https://github.com/python/cpython/issues/74534 + # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags) else: inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) if not outer: @@ -352,11 +366,11 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100): obj = expr[4:] if obj.startswith('Date('): left, right = self._separate_at_paren(obj[4:]) - expr = unified_timestamp( + date = unified_timestamp( self.interpret_expression(left, local_vars, allow_recursion), False) - if not expr: + if date is None: raise self.Exception(f'Failed to parse date {left!r}', expr) - expr = self._dump(int(expr * 1000), local_vars) + right + expr = self._dump(int(date * 1000), local_vars) + right else: raise self.Exception(f'Unsupported object {obj}', expr) @@ -400,10 +414,25 @@ def dict_item(key, val): m = re.match(r'''(?x) (?Ptry)\s*\{| + (?Pif)\s*\(| (?Pswitch)\s*\(| (?Pfor)\s*\( ''', expr) md = m.groupdict() if m else {} + if md.get('if'): + cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) + if_expr, expr = self._separate_at_paren(expr.lstrip()) + # TODO: "else if" is not handled + else_expr = None + m = re.match(r'else\s*{', expr) + if m: + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) + ret, should_abort = self.interpret_statement( + if_expr if cndn else else_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + if md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None @@ -416,7 +445,7 @@ def dict_item(key, val): err = e pending = (None, False) - m = re.match(r'catch\s*(?P\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + m = re.match(fr'catch\s*(?P\(\s*{_NAME_RE}\s*\))?\{{', expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) if err: @@ -445,7 +474,7 @@ def dict_item(key, val): if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: - switch_m = re.match(r'switch\s*\(', remaining) # FIXME + switch_m = re.match(r'switch\s*\(', remaining) # FIXME: ? if switch_m: switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') @@ -556,9 +585,9 @@ def dict_item(key, val): return int(expr), should_return elif expr == 'break': - raise JS_Break() + raise JS_Break elif expr == 'continue': - raise JS_Continue() + raise JS_Continue elif expr == 'undefined': return JS_Undefined, should_return elif expr == 'NaN': @@ -668,12 +697,12 @@ def eval_method(): elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') - index, howMany = map(int, (argvals + [len(obj)])[:2]) + index, how_many = map(int, ([*argvals, len(obj)])[:2]) if index < 0: index += len(obj) add_items = argvals[2:] res = [] - for i in range(index, min(index + howMany, len(obj))): + for _ in range(index, min(index + how_many, len(obj))): res.append(obj.pop(index)) for i, item in enumerate(add_items): obj.insert(index + i, item) @@ -697,12 +726,12 @@ def eval_method(): elif member == 'forEach': assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') - f, this = (argvals + [''])[:2] + f, this = ([*argvals, ''])[:2] return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] elif member == 'indexOf': assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') - idx, start = (argvals + [0])[:2] + idx, start = ([*argvals, 0])[:2] try: return obj.index(idx, start) except ValueError: @@ -750,7 +779,7 @@ def extract_object(self, objname): obj = {} obj_m = re.search( r'''(?x) - (?(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), @@ -766,7 +795,8 @@ def extract_object(self, objname): fields) for f in fields_m: argnames = f.group('args').split(',') - obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) + name = remove_quotes(f.group('key')) + obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>') return obj @@ -782,13 +812,15 @@ def extract_function_code(self, funcname): \((?P[^)]*)\)\s* (?P{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code')) if func_m is None: raise self.Exception(f'Could not find JS function "{funcname}"') + code, _ = self._separate_at_paren(func_m.group('code')) return [x.strip() for x in func_m.group('args').split(',')], code def extract_function(self, funcname): - return self.extract_function_from_code(*self.extract_function_code(funcname)) + return function_with_repr( + self.extract_function_from_code(*self.extract_function_code(funcname)), + f'F<{funcname}>') def extract_function_from_code(self, argnames, code, *global_stack): local_vars = {}