jfr.im git - yt-dlp.git/commitdiff
[cookies] Parse cookies leniently (#4780)
author Simon Sawicki <redacted>
Fri, 16 Sep 2022 17:02:00 +0000 (19:02 +0200)
committer GitHub <redacted>
Fri, 16 Sep 2022 17:02:00 +0000 (22:32 +0530)
Closes #4776, #3778
Authored by: Grub4K

test/test_cookies.py
yt_dlp/cookies.py
yt_dlp/extractor/common.py

index cfeb11b5526de124a87697700306d3820e72c325..61619df2979422ac9f2e027b1257729ce14e6eff 100644 (file)
@@ -3,6 +3,7 @@
 
 from yt_dlp import cookies
 from yt_dlp.cookies import (
+    LenientSimpleCookie,
     LinuxChromeCookieDecryptor,
     MacChromeCookieDecryptor,
     WindowsChromeCookieDecryptor,
@@ -137,3 +138,161 @@ def test_safari_cookie_parsing(self):
     def test_pbkdf2_sha1(self):
         key = pbkdf2_sha1(b'peanuts', b' ' * 16, 1, 16)
         self.assertEqual(key, b'g\xe1\x8e\x0fQ\x1c\x9b\xf3\xc9`!\xaa\x90\xd9\xd34')
+
+
+class TestLenientSimpleCookie(unittest.TestCase):
+    """Tests for the string parsing of LenientSimpleCookie"""
+
+    def _run_tests(self, *cases):
+        """Parse every raw cookie string and compare it with its expectation
+
+        Each case is a (message, raw_cookie, expected) tuple. `expected` maps
+        every cookie name that has to be present to either its value or to a
+        (value, attributes) tuple if the morsel should carry attributes.
+        """
+        for message, raw_cookie, expected in cases:
+            with self.subTest(message, expected=expected):
+                # Parse inside the subTest so that an exception raised while
+                # parsing is attributed to this case and does not abort the rest
+                cookie = LenientSimpleCookie(raw_cookie)
+                self.assertEqual(cookie.keys(), expected.keys(), message)
+
+                for key, expected_value in expected.items():
+                    morsel = cookie[key]
+                    if isinstance(expected_value, tuple):
+                        expected_value, expected_attributes = expected_value
+                    else:
+                        expected_attributes = {}
+
+                    # Attributes that were never set are empty strings in a
+                    # Morsel, leave them out of the comparison
+                    attributes = {
+                        name: value
+                        for name, value in dict(morsel).items()
+                        if value != ""
+                    }
+                    self.assertEqual(attributes, expected_attributes, message)
+
+                    self.assertEqual(morsel.value, expected_value, message)
+
+    def test_parsing(self):
+        """Well-formed input that CPython's SimpleCookie parses as well"""
+        self._run_tests(
+            # Copied from https://github.com/python/cpython/blob/v3.10.7/Lib/test/test_http_cookies.py
+            (
+                "Test basic cookie",
+                "chips=ahoy; vienna=finger",
+                {"chips": "ahoy", "vienna": "finger"},
+            ),
+            (
+                "Test quoted cookie",
+                'keebler="E=mc2; L=\\"Loves\\"; fudge=\\012;"',
+                {"keebler": 'E=mc2; L="Loves"; fudge=\012;'},
+            ),
+            (
+                "Allow '=' in an unquoted value",
+                "keebler=E=mc2",
+                {"keebler": "E=mc2"},
+            ),
+            (
+                "Allow cookies with ':' in their name",
+                "key:term=value:term",
+                {"key:term": "value:term"},
+            ),
+            (
+                "Allow '[' and ']' in cookie values",
+                "a=b; c=[; d=r; f=h",
+                {"a": "b", "c": "[", "d": "r", "f": "h"},
+            ),
+            (
+                "Test basic cookie attributes",
+                'Customer="WILE_E_COYOTE"; Version=1; Path=/acme',
+                {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})},
+            ),
+            (
+                "Test flag only cookie attributes",
+                'Customer="WILE_E_COYOTE"; HttpOnly; Secure',
+                {"Customer": ("WILE_E_COYOTE", {"httponly": True, "secure": True})},
+            ),
+            (
+                "Test flag only attribute with values",
+                "eggs=scrambled; httponly=foo; secure=bar; Path=/bacon",
+                {"eggs": ("scrambled", {"httponly": "foo", "secure": "bar", "path": "/bacon"})},
+            ),
+            (
+                "Test special case for 'expires' attribute, 4 digit year",
+                'Customer="W"; expires=Wed, 01 Jan 2010 00:00:00 GMT',
+                {"Customer": ("W", {"expires": "Wed, 01 Jan 2010 00:00:00 GMT"})},
+            ),
+            (
+                "Test special case for 'expires' attribute, 2 digit year",
+                'Customer="W"; expires=Wed, 01 Jan 98 00:00:00 GMT',
+                {"Customer": ("W", {"expires": "Wed, 01 Jan 98 00:00:00 GMT"})},
+            ),
+            (
+                "Test extra spaces in keys and values",
+                "eggs  =  scrambled  ;  secure  ;  path  =  bar   ; foo=foo   ",
+                {"eggs": ("scrambled", {"secure": True, "path": "bar"}), "foo": "foo"},
+            ),
+            (
+                "Test quoted attributes",
+                'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"',
+                {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})}
+            ),
+            # Our own tests that CPython passes
+            (
+                "Allow ';' in quoted value",
+                'chips="a;hoy"; vienna=finger',
+                {"chips": "a;hoy", "vienna": "finger"},
+            ),
+            (
+                "Keep only the last set value",
+                "a=c; a=b",
+                {"a": "b"},
+            ),
+        )
+
+    def test_lenient_parsing(self):
+        """Malformed input: invalid parts are skipped, valid cookies are kept"""
+        self._run_tests(
+            (
+                "Ignore and try to skip invalid cookies",
+                'chips={"ahoy;": 1}; vienna="finger;"',
+                {"vienna": "finger;"},
+            ),
+            (
+                "Ignore cookies without a name",
+                "a=b; unnamed; c=d",
+                {"a": "b", "c": "d"},
+            ),
+            (
+                "Ignore '\"' cookie without name",
+                'a=b; "; c=d',
+                {"a": "b", "c": "d"},
+            ),
+            (
+                "Skip all space separated values",
+                "x a=b c=d x; e=f",
+                {"a": "b", "c": "d", "e": "f"},
+            ),
+            (
+                "Skip all space separated values",
+                'x a=b; data={"complex": "json", "with": "key=value"}; x c=d x',
+                {"a": "b", "c": "d"},
+            ),
+            (
+                "Expect quote mending",
+                'a=b; invalid="; c=d',
+                {"a": "b", "c": "d"},
+            ),
+            (
+                "Reset morsel after invalid to not capture attributes",
+                "a=b; invalid; Version=1; c=d",
+                {"a": "b", "c": "d"},
+            ),
+            (
+                "Continue after non-flag attribute without value",
+                "a=b; path; Version=1; c=d",
+                {"a": "b", "c": "d"},
+            ),
+        )
index c3b14f03bb670e9bc55470bffecccfcdad175c8f..d502e91da6cea38ef4b752c99509ff2d6f1cc1b1 100644 (file)
@@ -1,6 +1,7 @@
 import base64
 import contextlib
 import http.cookiejar
+import http.cookies
 import json
 import os
 import re
@@ -990,3 +991,119 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta
     if profile is not None and _is_path(profile):
         profile = os.path.expanduser(profile)
     return browser_name, profile, keyring, container
+
+
+class LenientSimpleCookie(http.cookies.SimpleCookie):
+    """More lenient version of http.cookies.SimpleCookie
+
+    When loading a string, parts that cannot be parsed as a cookie are
+    skipped and parsing continues with the next part instead of stopping.
+    Anything that is not a string is passed on to the standard implementation.
+    """
+    # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py
+    _LEGAL_KEY_CHARS = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\="
+    _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + r"\[\]"
+
+    # Lower case names that are treated as attributes of the preceding cookie
+    _RESERVED = {
+        "expires",
+        "path",
+        "comment",
+        "domain",
+        "max-age",
+        "secure",
+        "httponly",
+        "version",
+        "samesite",
+    }
+
+    # Attributes that may appear without a value
+    _FLAGS = {"secure", "httponly"}
+
+    # Added 'bad' group to catch the remaining value
+    _COOKIE_PATTERN = re.compile(r"""
+        \s*                            # Optional whitespace at start of cookie
+        (?P<key>                       # Start of group 'key'
+        [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter
+        )                              # End of group 'key'
+        (                              # Optional group: there may not be a value.
+        \s*=\s*                          # Equal Sign
+        (                                # Start of potential value
+        (?P<val>                           # Start of group 'val'
+        "(?:[^\\"]|\\.)*"                    # Any doublequoted string
+        |                                    # or
+        \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr
+        |                                    # or
+        [""" + _LEGAL_VALUE_CHARS + r"""]*     # Any word or empty string
+        )                                  # End of group 'val'
+        |                                  # or
+        (?P<bad>(?:\\;|[^;])*?)            # 'bad' group fallback for invalid values
+        )                                # End of potential value
+        )?                             # End of optional value group
+        \s*                            # Any number of spaces.
+        (\s+|;|$)                      # Ending either at space, semicolon, or EOS.
+        """, re.ASCII | re.VERBOSE)
+
+    def load(self, data):
+        """Load cookies from `data`, skipping the parts that are invalid
+
+        Strings are parsed with _COOKIE_PATTERN; for every other type the
+        result of http.cookies.SimpleCookie.load is returned.
+        """
+        # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4776
+        if not isinstance(data, str):
+            return super().load(data)
+
+        # Morsel of the last valid cookie; attributes that follow are applied to it
+        morsel = None
+        index = 0
+        length = len(data)
+
+        while 0 <= index < length:
+            match = self._COOKIE_PATTERN.search(data, index)
+            if not match:
+                break
+
+            index = match.end(0)
+            if match.group("bad"):
+                # Invalid value: drop the part and do not attach later attributes to it
+                morsel = None
+                continue
+
+            key, value = match.group("key", "val")
+
+            if key[0] == "$":
+                # '$'-prefixed names (e.g. '$Version') are attributes of the preceding
+                # cookie. Morsel raises CookieError for names it does not know, skip those
+                if morsel is not None and key[1:].lower() in self._RESERVED:
+                    morsel[key[1:]] = True
+                continue
+
+            lower_key = key.lower()
+            if lower_key in self._RESERVED:
+                if morsel is None:
+                    continue
+
+                if value is None:
+                    if lower_key not in self._FLAGS:
+                        morsel = None
+                        continue
+                    value = True
+                else:
+                    value, _ = self.value_decode(value)
+
+                morsel[key] = value
+
+            elif value is not None:
+                morsel = self.get(key, http.cookies.Morsel())
+                real_value, coded_value = self.value_decode(value)
+                try:
+                    morsel.set(key, real_value, coded_value)
+                except http.cookies.CookieError:
+                    # The pattern matches names with characters that Morsel rejects
+                    morsel = None
+                    continue
+                self[key] = morsel
+
+            else:
+                morsel = None
index 30042d61feade16ca9bc48859576903f4b990c44..e8fa8fdde8b73745e4f5603fd850a4279382e42b 100644 (file)
@@ -22,6 +22,7 @@
 
 from ..compat import functools  # isort: split
 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..cookies import LenientSimpleCookie
 from ..downloader import FileDownloader
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..utils import (
@@ -3632,7 +3633,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 
     def _get_cookies(self, url):
         """ Return a http.cookies.SimpleCookie with the cookies for the url """
-        return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
+        return LenientSimpleCookie(self._downloader._calc_cookies(url))
 
     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """