]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/jsinterp.py
[ie/mlbtv] Fix extraction (#10296)
[yt-dlp.git] / yt_dlp / jsinterp.py
index 99bdca9270b2df77c396d25e680171ac86ebfecd..a0f32892fd68f6ae8c605b41961cd7ea079aab30 100644 (file)
@@ -9,6 +9,7 @@
 from .utils import (
     NO_DEFAULT,
     ExtractorError,
+    function_with_repr,
     js_to_json,
     remove_quotes,
     truncate_string,
 
 
 def _js_bit_op(op):
+    def zeroise(x):
+        if x in (None, JS_Undefined):
+            return 0
+        with contextlib.suppress(TypeError):
+            if math.isnan(x):  # NB: NaN cannot be checked by membership
+                return 0
+        return x
+
     def wrapped(a, b):
-        def zeroise(x):
-            return 0 if x in (None, JS_Undefined) else x
-        return op(zeroise(a), zeroise(b))
+        return op(zeroise(a), zeroise(b)) & 0xffffffff
 
     return wrapped
 
@@ -37,7 +44,7 @@ def wrapped(a, b):
 
 
 def _js_div(a, b):
-    if JS_Undefined in (a, b) or not (a and b):
+    if JS_Undefined in (a, b) or not (a or b):
         return float('nan')
     return (a or 0) / b if b else float('inf')
 
@@ -71,6 +78,8 @@ def _js_comp_op(op):
     def wrapped(a, b):
         if JS_Undefined in (a, b):
             return False
+        if isinstance(a, str) or isinstance(b, str):
+            return op(str(a or 0), str(b or 0))
         return op(a or 0, b or 0)
 
     return wrapped
@@ -114,8 +123,8 @@ def _js_ternary(cndn, if_true=True, if_false=False):
     '-': _js_arith_op(operator.sub),
 
     '*': _js_arith_op(operator.mul),
-    '/': _js_div,
     '%': _js_mod,
+    '/': _js_div,
     '**': _js_exp,
 }
 
@@ -181,7 +190,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs
                     cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion)
                 raise
             if cls.ENABLED and stmt.strip():
-                cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion)
+                if should_ret or repr(ret) != stmt:
+                    cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion)
             return ret, should_ret
         return interpret_statement
 
@@ -202,13 +212,11 @@ class JSInterpreter:
         'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
     }
 
-    _EXC_NAME = '__yt_dlp_exception__'
-
     def __init__(self, code, objects=None):
         self.code, self._functions = code, {}
         self._objects = {} if objects is None else objects
 
-    class Exception(ExtractorError):
+    class Exception(ExtractorError):  # noqa: A001
         def __init__(self, msg, expr=None, *args, **kwargs):
             if expr is not None:
                 msg = f'{msg.rstrip()} in: {truncate_string(expr, 50, 50)}'
@@ -217,6 +225,8 @@ def __init__(self, msg, expr=None, *args, **kwargs):
     def _named_object(self, namespace, obj):
         self.__named_object_counter += 1
         name = f'__yt_dlp_jsinterp_obj{self.__named_object_counter}'
+        if callable(obj) and not isinstance(obj, function_with_repr):
+            obj = function_with_repr(obj, f'F<{self.__named_object_counter}>')
         namespace[name] = obj
         return name
 
@@ -225,7 +235,7 @@ def _regex_flags(cls, expr):
         flags = 0
         if not expr:
             return flags, expr
-        for idx, ch in enumerate(expr):
+        for idx, ch in enumerate(expr):  # noqa: B007
             if ch not in cls._RE_FLAGS:
                 break
             flags |= cls._RE_FLAGS[ch]
@@ -233,7 +243,7 @@ def _regex_flags(cls, expr):
 
     @staticmethod
     def _separate(expr, delim=',', max_split=None):
-        OP_CHARS = '+-*/%&|^=<>!,;{}:'
+        OP_CHARS = '+-*/%&|^=<>!,;{}:['
         if not expr:
             return
         counters = {k: 0 for k in _MATCHING_PARENS.values()}
@@ -243,16 +253,21 @@ def _separate(expr, delim=',', max_split=None):
             if not in_quote and char in _MATCHING_PARENS:
                 counters[_MATCHING_PARENS[char]] += 1
             elif not in_quote and char in counters:
-                counters[char] -= 1
-            elif not escaping and char in _QUOTES and in_quote in (char, None):
-                if in_quote or after_op or char != '/':
-                    in_quote = None if in_quote and not in_regex_char_group else char
-            elif in_quote == '/' and char in '[]':
-                in_regex_char_group = char == '['
+                # Something's wrong if we get negative, but ignore it anyway
+                if counters[char]:
+                    counters[char] -= 1
+            elif not escaping:
+                if char in _QUOTES and in_quote in (char, None):
+                    if in_quote or after_op or char != '/':
+                        in_quote = None if in_quote and not in_regex_char_group else char
+                elif in_quote == '/' and char in '[]':
+                    in_regex_char_group = char == '['
             escaping = not escaping and in_quote and char == '\\'
-            after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op)
+            in_unary_op = (not in_quote and not in_regex_char_group
+                           and after_op not in (True, False) and char in '-+')
+            after_op = char if (not in_quote and char in OP_CHARS) else (char.isspace() and after_op)
 
-            if char != delim[pos] or any(counters.values()) or in_quote:
+            if char != delim[pos] or any(counters.values()) or in_quote or in_unary_op:
                 pos = 0
                 continue
             elif pos != delim_len:
@@ -266,7 +281,9 @@ def _separate(expr, delim=',', max_split=None):
         yield expr[start:]
 
     @classmethod
-    def _separate_at_paren(cls, expr, delim):
+    def _separate_at_paren(cls, expr, delim=None):
+        if delim is None:
+            delim = expr and _MATCHING_PARENS[expr[0]]
         separated = list(cls._separate(expr, delim, 1))
         if len(separated) < 2:
             raise cls.Exception(f'No terminating paren {delim}', expr)
@@ -335,7 +352,10 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100):
             inner, outer = self._separate(expr, expr[0], 1)
             if expr[0] == '/':
                 flags, outer = self._regex_flags(outer)
-                inner = re.compile(inner[1:], flags=flags)
+                # We don't support regex methods yet, so no point compiling it
+                inner = f'{inner}/{flags}'
+                # Avoid https://github.com/python/cpython/issues/74534
+                # inner = re.compile(inner[1:].replace('[[', r'[\['), flags=flags)
             else:
                 inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True))
             if not outer:
@@ -345,12 +365,12 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100):
         if expr.startswith('new '):
             obj = expr[4:]
             if obj.startswith('Date('):
-                left, right = self._separate_at_paren(obj[4:], ')')
-                expr = unified_timestamp(
+                left, right = self._separate_at_paren(obj[4:])
+                date = unified_timestamp(
                     self.interpret_expression(left, local_vars, allow_recursion), False)
-                if not expr:
+                if date is None:
                     raise self.Exception(f'Failed to parse date {left!r}', expr)
-                expr = self._dump(int(expr * 1000), local_vars) + right
+                expr = self._dump(int(date * 1000), local_vars) + right
             else:
                 raise self.Exception(f'Unsupported object {obj}', expr)
 
@@ -359,8 +379,8 @@ def interpret_statement(self, stmt, local_vars, allow_recursion=100):
             return None, should_return
 
         if expr.startswith('{'):
-            inner, outer = self._separate_at_paren(expr, '}')
-            # Look for Map first
+            inner, outer = self._separate_at_paren(expr)
+            # try for object expression (Map)
             sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)]
             if all(len(sub_expr) == 2 for sub_expr in sub_expressions):
                 def dict_item(key, val):
@@ -378,7 +398,7 @@ def dict_item(key, val):
                 expr = self._dump(inner, local_vars) + outer
 
         if expr.startswith('('):
-            inner, outer = self._separate_at_paren(expr, ')')
+            inner, outer = self._separate_at_paren(expr)
             inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion)
             if not outer or should_abort:
                 return inner, should_abort or should_return
@@ -386,53 +406,77 @@ def dict_item(key, val):
                 expr = self._dump(inner, local_vars) + outer
 
         if expr.startswith('['):
-            inner, outer = self._separate_at_paren(expr, ']')
+            inner, outer = self._separate_at_paren(expr)
             name = self._named_object(local_vars, [
                 self.interpret_expression(item, local_vars, allow_recursion)
                 for item in self._separate(inner)])
             expr = name + outer
 
-        m = re.match(rf'''(?x)
-            (?P<try>try|finally)\s*|
-            (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))|
-            (?P<switch>switch)\s*\(|
-            (?P<for>for)\s*\(|''', expr)
-        if m and m.group('try'):
-            if expr[m.end()] == '{':
-                try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
-            else:
-                try_expr, expr = expr[m.end() - 1:], ''
+        m = re.match(r'''(?x)
+                (?P<try>try)\s*\{|
+                (?P<if>if)\s*\(|
+                (?P<switch>switch)\s*\(|
+                (?P<for>for)\s*\(
+                ''', expr)
+        md = m.groupdict() if m else {}
+        if md.get('if'):
+            cndn, expr = self._separate_at_paren(expr[m.end() - 1:])
+            if_expr, expr = self._separate_at_paren(expr.lstrip())
+            # TODO: "else if" is not handled
+            else_expr = None
+            m = re.match(r'else\s*{', expr)
+            if m:
+                else_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
+            cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion))
+            ret, should_abort = self.interpret_statement(
+                if_expr if cndn else else_expr, local_vars, allow_recursion)
+            if should_abort:
+                return ret, True
+
+        if md.get('try'):
+            try_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
+            err = None
             try:
                 ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion)
                 if should_abort:
                     return ret, True
-            except JS_Throw as e:
-                local_vars[self._EXC_NAME] = e.error
             except Exception as e:
                 # XXX: This works for now, but makes debugging future issues very hard
-                local_vars[self._EXC_NAME] = e
-            ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
-            return ret, should_abort or should_return
-
-        elif m and m.group('catch'):
-            catch_expr, expr = self._separate_at_paren(expr[m.end():], '}')
-            if self._EXC_NAME in local_vars:
-                catch_vars = local_vars.new_child({m.group('err'): local_vars.pop(self._EXC_NAME)})
-                ret, should_abort = self.interpret_statement(catch_expr, catch_vars, allow_recursion)
+                err = e
+
+            pending = (None, False)
+            m = re.match(fr'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{', expr)
+            if m:
+                sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
+                if err:
+                    catch_vars = {}
+                    if m.group('err'):
+                        catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err
+                    catch_vars = local_vars.new_child(catch_vars)
+                    err, pending = None, self.interpret_statement(sub_expr, catch_vars, allow_recursion)
+
+            m = re.match(r'finally\s*\{', expr)
+            if m:
+                sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:])
+                ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion)
                 if should_abort:
                     return ret, True
 
-            ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
-            return ret, should_abort or should_return
+            ret, should_abort = pending
+            if should_abort:
+                return ret, True
+
+            if err:
+                raise err
 
-        elif m and m.group('for'):
-            constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+        elif md.get('for'):
+            constructor, remaining = self._separate_at_paren(expr[m.end() - 1:])
             if remaining.startswith('{'):
-                body, expr = self._separate_at_paren(remaining, '}')
+                body, expr = self._separate_at_paren(remaining)
             else:
-                switch_m = re.match(r'switch\s*\(', remaining)  # FIXME
+                switch_m = re.match(r'switch\s*\(', remaining)  # FIXME: ?
                 if switch_m:
-                    switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:], ')')
+                    switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:])
                     body, expr = self._separate_at_paren(remaining, '}')
                     body = 'switch(%s){%s}' % (switch_val, body)
                 else:
@@ -451,11 +495,9 @@ def dict_item(key, val):
                 except JS_Continue:
                     pass
                 self.interpret_expression(increment, local_vars, allow_recursion)
-            ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
-            return ret, should_abort or should_return
 
-        elif m and m.group('switch'):
-            switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+        elif md.get('switch'):
+            switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:])
             switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
             body, expr = self._separate_at_paren(remaining, '}')
             items = body.replace('default:', 'case default:').split('case ')[1:]
@@ -478,6 +520,8 @@ def dict_item(key, val):
                         break
                 if matched:
                     break
+
+        if md:
             ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion)
             return ret, should_abort or should_return
 
@@ -541,9 +585,9 @@ def dict_item(key, val):
             return int(expr), should_return
 
         elif expr == 'break':
-            raise JS_Break()
+            raise JS_Break
         elif expr == 'continue':
-            raise JS_Continue()
+            raise JS_Continue
         elif expr == 'undefined':
             return JS_Undefined, should_return
         elif expr == 'NaN':
@@ -582,7 +626,7 @@ def dict_item(key, val):
                 member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion)
             arg_str = expr[m.end():]
             if arg_str.startswith('('):
-                arg_str, remaining = self._separate_at_paren(arg_str, ')')
+                arg_str, remaining = self._separate_at_paren(arg_str)
             else:
                 arg_str, remaining = None, arg_str
 
@@ -623,12 +667,12 @@ def eval_method():
                     self.interpret_expression(v, local_vars, allow_recursion)
                     for v in self._separate(arg_str)]
 
-                if obj == str:
+                if obj is str:
                     if member == 'fromCharCode':
                         assertion(argvals, 'takes one or more arguments')
                         return ''.join(map(chr, argvals))
                     raise self.Exception(f'Unsupported String method {member}', expr)
-                elif obj == float:
+                elif obj is float:
                     if member == 'pow':
                         assertion(len(argvals) == 2, 'takes two arguments')
                         return argvals[0] ** argvals[1]
@@ -653,12 +697,12 @@ def eval_method():
                 elif member == 'splice':
                     assertion(isinstance(obj, list), 'must be applied on a list')
                     assertion(argvals, 'takes one or more arguments')
-                    index, howMany = map(int, (argvals + [len(obj)])[:2])
+                    index, how_many = map(int, ([*argvals, len(obj)])[:2])
                     if index < 0:
                         index += len(obj)
                     add_items = argvals[2:]
                     res = []
-                    for i in range(index, min(index + howMany, len(obj))):
+                    for _ in range(index, min(index + how_many, len(obj))):
                         res.append(obj.pop(index))
                     for i, item in enumerate(add_items):
                         obj.insert(index + i, item)
@@ -682,16 +726,23 @@ def eval_method():
                 elif member == 'forEach':
                     assertion(argvals, 'takes one or more arguments')
                     assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
-                    f, this = (argvals + [''])[:2]
+                    f, this = ([*argvals, ''])[:2]
                     return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)]
                 elif member == 'indexOf':
                     assertion(argvals, 'takes one or more arguments')
                     assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
-                    idx, start = (argvals + [0])[:2]
+                    idx, start = ([*argvals, 0])[:2]
                     try:
                         return obj.index(idx, start)
                     except ValueError:
                         return -1
+                elif member == 'charCodeAt':
+                    assertion(isinstance(obj, str), 'must be applied on a string')
+                    assertion(len(argvals) == 1, 'takes exactly one argument')
+                    idx = argvals[0] if isinstance(argvals[0], int) else 0
+                    if idx >= len(obj):
+                        return None
+                    return ord(obj[idx])
 
                 idx = int(member) if isinstance(obj, list) else member
                 return obj[idx](argvals, allow_recursion=allow_recursion)
@@ -728,7 +779,7 @@ def extract_object(self, objname):
         obj = {}
         obj_m = re.search(
             r'''(?x)
-                (?<!this\.)%s\s*=\s*{\s*
+                (?<!\.)%s\s*=\s*{\s*
                     (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
                 }\s*;
             ''' % (re.escape(objname), _FUNC_NAME_RE),
@@ -744,7 +795,8 @@ def extract_object(self, objname):
             fields)
         for f in fields_m:
             argnames = f.group('args').split(',')
-            obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
+            name = remove_quotes(f.group('key'))
+            obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>')
 
         return obj
 
@@ -760,13 +812,15 @@ def extract_function_code(self, funcname):
                 \((?P<args>[^)]*)\)\s*
                 (?P<code>{.+})''' % {'name': re.escape(funcname)},
             self.code)
-        code, _ = self._separate_at_paren(func_m.group('code'), '}')
         if func_m is None:
             raise self.Exception(f'Could not find JS function "{funcname}"')
+        code, _ = self._separate_at_paren(func_m.group('code'))
         return [x.strip() for x in func_m.group('args').split(',')], code
 
     def extract_function(self, funcname):
-        return self.extract_function_from_code(*self.extract_function_code(funcname))
+        return function_with_repr(
+            self.extract_function_from_code(*self.extract_function_code(funcname)),
+            f'F<{funcname}>')
 
     def extract_function_from_code(self, argnames, code, *global_stack):
         local_vars = {}
@@ -775,7 +829,7 @@ def extract_function_from_code(self, argnames, code, *global_stack):
             if mobj is None:
                 break
             start, body_start = mobj.span()
-            body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
+            body, remaining = self._separate_at_paren(code[body_start - 1:])
             name = self._named_object(local_vars, self.extract_function_from_code(
                 [x.strip() for x in mobj.group('args').split(',')],
                 body, local_vars, *global_stack))