]>
Commit | Line | Data |
---|---|---|
52f5be1f | 1 | from __future__ import annotations |
2 | ||
3 | import io | |
4 | import math | |
5 | import urllib.parse | |
6 | ||
7 | from ._helper import InstanceStoreMixin, select_proxy | |
8 | from .common import ( | |
9 | Features, | |
10 | Request, | |
11 | Response, | |
12 | register_preference, | |
13 | register_rh, | |
14 | ) | |
15 | from .exceptions import ( | |
16 | CertificateVerifyError, | |
17 | HTTPError, | |
18 | IncompleteRead, | |
19 | ProxyError, | |
20 | SSLError, | |
21 | TransportError, | |
22 | ) | |
23 | from .impersonate import ImpersonateRequestHandler, ImpersonateTarget | |
3c7a287e | 24 | from ..dependencies import curl_cffi, certifi |
52f5be1f | 25 | from ..utils import int_or_none |
26 | ||
# `curl_cffi` is None when the optional dependency could not be provided
# (presumably resolved by `..dependencies` - confirm there).
if curl_cffi is None:
    raise ImportError('curl_cffi is not installed')

# Turn the dotted version string into a tuple, one entry per component.
# NOTE(review): `default=0` presumably maps non-numeric components to 0 - confirm in `int_or_none`.
curl_cffi_version = tuple(
    int_or_none(component, default=0)
    for component in curl_cffi.__version__.split('.')
)

# Only the exact 0.5.10 release is accepted. The module is tagged with an
# "(unsupported)" version label before the import is refused.
if curl_cffi_version != (0, 5, 10):
    curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
    raise ImportError('Only curl_cffi 0.5.10 is supported')
35 | ||
36 | import curl_cffi.requests | |
37 | from curl_cffi.const import CurlECode, CurlOpt | |
38 | ||
39 | ||
class CurlCFFIResponseReader(io.IOBase):
    """File-like reader over the streamed body of a curl_cffi response.

    Chunks are pulled lazily from `response.iter_content()` and buffered so that
    callers can request arbitrary sizes. `bytes_read` counts every byte pulled from
    the iterator so far, including bytes still waiting in the internal buffer.
    """

    def __init__(self, response: curl_cffi.requests.Response):
        self._response = response
        self._iterator = response.iter_content()
        self._buffer = b''
        self.bytes_read = 0

    def readable(self):
        return True

    def read(self, size=None):
        """Read up to `size` bytes from the response body.

        @param size  Maximum number of bytes to return. `None` or a negative value
                     reads everything that is left.
        @returns     The bytes read; fewer than `size` (possibly `b''`) once the body is exhausted.

        Any exception raised while pulling chunks closes the reader before it propagates.
        """
        # Follow the io convention that a negative size means "read until EOF".
        # Slicing with a negative size would silently drop data from the end of the buffer.
        if size is not None and size < 0:
            size = None

        exception_raised = True
        try:
            # Collect chunks and join once; repeated bytes concatenation is quadratic on large bodies
            pending = [self._buffer]
            available = len(self._buffer)
            while self._iterator and (size is None or available < size):
                chunk = next(self._iterator, None)
                if chunk is None:
                    self._iterator = None
                    break
                pending.append(chunk)
                available += len(chunk)
                self.bytes_read += len(chunk)
            self._buffer = b''.join(pending)

            if size is None:
                size = len(self._buffer)
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]

            # "free" the curl instance if the response is fully read.
            # curl_cffi doesn't do this automatically and only allows one open response per thread
            if not self._iterator and not self._buffer:
                self.close()
            exception_raised = False
            return data
        finally:
            if exception_raised:
                self.close()

    def close(self):
        """Close the underlying response (once) and discard any buffered data."""
        if not self.closed:
            self._response.close()
            self._buffer = b''
        super().close()
81 | ||
82 | ||
class CurlCFFIResponseAdapter(Response):
    """Wrap a streamed curl_cffi response in the `Response` interface.

    The body is exposed through a `CurlCFFIResponseReader`; headers, URL and status
    code are copied from the curl_cffi response.
    """
    fp: CurlCFFIResponseReader

    def __init__(self, response: curl_cffi.requests.Response):
        super().__init__(
            fp=CurlCFFIResponseReader(response),
            headers=response.headers,
            url=response.url,
            status=response.status_code)

    def read(self, amt=None):
        """Read from the body, translating curl_cffi errors.

        @param amt  Number of bytes to read, passed through to the reader.
        @raises IncompleteRead  if curl reports a partial transfer (PARTIAL_FILE).
        @raises TransportError  for any other curl_cffi request error.
        """
        try:
            return self.fp.read(amt)
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code == CurlECode.PARTIAL_FILE:
                # The error may not carry a response object. Without one the total length
                # is unknown, and dereferencing it would mask the IncompleteRead.
                error_response = getattr(e, 'response', None)
                content_length = (
                    int_or_none(error_response.headers.get('Content-Length'))
                    if error_response is not None else None)
                raise IncompleteRead(
                    partial=self.fp.bytes_read,
                    expected=content_length - self.fp.bytes_read if content_length is not None else None,
                    cause=e) from e
            raise TransportError(cause=e) from e
104 | ||
105 | ||
@register_rh
class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
    """Request handler that sends requests through curl_cffi with browser impersonation."""

    RH_NAME = 'curl_cffi'
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    # Maps each supported impersonation target to the curl_cffi browser profile used for it
    _SUPPORTED_IMPERSONATE_TARGET_MAP = {
        ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
        ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
        ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
        ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
        ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
        ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
        ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
        ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
        ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
        ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
        ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
    }

    def _create_instance(self, cookiejar=None):
        """Build a new curl_cffi session that uses `cookiejar` for its cookies."""
        return curl_cffi.requests.Session(cookies=cookiejar)

    def _check_extensions(self, extensions):
        """Run the parent check, then drop the extension keys this handler accepts."""
        super()._check_extensions(extensions)
        for handled_key in ('impersonate', 'cookiejar', 'timeout'):
            extensions.pop(handled_key, None)

    def send(self, request: Request) -> Response:
        """Send `request` and record the impersonation target on the resulting response.

        The target is stored under `extensions['impersonate']`, both on a returned
        response and on the response attached to a raised `HTTPError`.
        """
        target = self._get_request_target(request)
        try:
            response = super().send(request)
        except HTTPError as error:
            error.response.extensions['impersonate'] = target
            raise
        else:
            response.extensions['impersonate'] = target
            return response

    def _send(self, request: Request):
        """Configure the curl session for `request`, perform it and adapt the result.

        @returns  A `CurlCFFIResponseAdapter` for a 2xx response.
        @raises CertificateVerifyError  when curl reports PEER_FAILED_VERIFICATION.
        @raises SSLError        when curl reports SSL_CONNECT_ERROR.
        @raises ProxyError      for proxy errors, including a 407 received after CONNECT.
        @raises TransportError  for any other curl_cffi request error.
        @raises HTTPError       when the status is outside 2xx; `redirect_loop` is set
                                if the redirect limit was exceeded.
        """
        max_redirects_exceeded = False

        # No cookiejar is passed when the request carries its own Cookie header
        jar = None if 'cookie' in request.headers else self._get_cookiejar(request)
        session: curl_cffi.requests.Session = self._get_instance(cookiejar=jar)
        handle = session.curl

        if self.verbose:
            handle.setopt(CurlOpt.VERBOSE, 1)

        proxies = self._get_proxies(request)
        if 'no' in proxies:
            handle.setopt(CurlOpt.NOPROXY, proxies.pop('no'))

        # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
        proxy = select_proxy(request.url, proxies=proxies)
        if proxy:
            handle.setopt(CurlOpt.PROXY, proxy)
            request_scheme = urllib.parse.urlparse(request.url).scheme.lower()
            if request_scheme != 'http':
                # Enable HTTP CONNECT for HTTPS urls.
                # Don't use CONNECT for http for compatibility with urllib behaviour.
                # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
                handle.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)

            # curl_cffi does not currently set these for proxies
            handle.setopt(CurlOpt.PROXY_CAINFO, certifi.where())

            if not self.verify:
                handle.setopt(CurlOpt.PROXY_SSL_VERIFYPEER, 0)
                handle.setopt(CurlOpt.PROXY_SSL_VERIFYHOST, 0)

        headers = self._get_impersonate_headers(request)

        client_cert = self._client_cert
        if client_cert:
            handle.setopt(CurlOpt.SSLCERT, client_cert['client_certificate'])
            key_path = client_cert.get('client_certificate_key')
            key_password = client_cert.get('client_certificate_password')
            if key_path:
                handle.setopt(CurlOpt.SSLKEY, key_path)
            if key_password:
                handle.setopt(CurlOpt.KEYPASSWD, key_password)

        timeout = self._calculate_timeout(request)

        # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
        # curl_cffi does not currently do this. [2]
        # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
        # [1] https://unix.stackexchange.com/a/305311
        # [2] https://github.com/yifeikong/curl_cffi/issues/156
        # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
        handle.setopt(CurlOpt.LOW_SPEED_LIMIT, 1)  # 1 byte per second
        handle.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))

        try:
            curl_response = session.request(
                method=request.method,
                url=request.url,
                headers=headers,
                data=request.data,
                verify=self.verify,
                max_redirects=5,
                timeout=timeout,
                impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                    self._get_request_target(request)),
                interface=self.source_address,
                stream=True
            )
        except curl_cffi.requests.errors.RequestsError as e:
            code = e.code
            if code == CurlECode.TOO_MANY_REDIRECTS:
                # Not fatal here: continue with the response attached to the error so the
                # status check below can report it as a redirect loop
                max_redirects_exceeded = True
                curl_response = e.response
            else:
                if code == CurlECode.PEER_FAILED_VERIFICATION:
                    error_cls = CertificateVerifyError
                elif code == CurlECode.SSL_CONNECT_ERROR:
                    error_cls = SSLError
                elif (
                    code == CurlECode.PROXY
                    or (code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e))
                ):
                    error_cls = ProxyError
                else:
                    error_cls = TransportError
                raise error_cls(cause=e) from e

        response = CurlCFFIResponseAdapter(curl_response)

        if not 200 <= response.status < 300:
            raise HTTPError(response, redirect_loop=max_redirects_exceeded)

        return response
237 | ||
238 | ||
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Return the preference score registered for `CurlCFFIRH`.

    The score is a constant: neither `rh` nor `request` influences it.
    NOTE(review): a negative value presumably ranks this handler below others -
    confirm against `register_preference`.
    """
    preference = -100
    return preference