]> jfr.im git - yt-dlp.git/blame - yt_dlp/networking/_requests.py
[ie/ARD] Overhaul extractors (#8878)
[yt-dlp.git] / yt_dlp / networking / _requests.py
CommitLineData
8a8b5452 1import contextlib
2import functools
3import http.client
4import logging
5import re
6import socket
7import warnings
8
9from ..dependencies import brotli, requests, urllib3
10from ..utils import bug_reports_message, int_or_none, variadic
11
if requests is None:
    raise ImportError('requests module is not installed')

if urllib3 is None:
    raise ImportError('urllib3 module is not installed')

# Parsed urllib3 version as a tuple of ints, e.g. '1.26.17' -> (1, 26, 17).
# Non-numeric components (e.g. pre-release tags) fall back to 0.
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))

if urllib3_version < (1, 26, 17):
    raise ImportError('Only urllib3 >= 1.26.17 is supported')

# requests.__build__ is a hex-encoded version number: 0x023100 == 2.31.0
if requests.__build__ < 0x023100:
    raise ImportError('Only requests >= 2.31.0 is supported')
25
26import requests.adapters
27import requests.utils
28import urllib3.connection
29import urllib3.exceptions
30
31from ._helper import (
32 InstanceStoreMixin,
33 add_accept_encoding_header,
34 create_connection,
35 create_socks_proxy_socket,
36 get_redirect_method,
37 make_socks_proxy_opts,
38 select_proxy,
39)
40from .common import (
41 Features,
42 RequestHandler,
43 Response,
44 register_preference,
45 register_rh,
46)
47from .exceptions import (
48 CertificateVerifyError,
49 HTTPError,
50 IncompleteRead,
51 ProxyError,
52 RequestError,
53 SSLError,
54 TransportError,
55)
56from ..socks import ProxyError as SocksProxyError
57
# Content-encodings we advertise via Accept-Encoding and can decode
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]

# brotli is an optional dependency (see ..dependencies)
if brotli is not None:
    SUPPORTED_ENCODINGS.append('br')
64
65"""
66Override urllib3's behavior to not convert lower-case percent-encoded characters
67to upper-case during url normalization process.
68
RFC3986 defines that the lower or upper case percent-encoded hexadecimal characters are equivalent
70and normalizers should convert them to uppercase for consistency [1].
71
72However, some sites may have an incorrect implementation where they provide
73a percent-encoded url that is then compared case-sensitively.[2]
74
75While this is a very rare case, since urllib does not do this normalization step, it
76is best to avoid it in requests too for compatability reasons.
77
781: https://tools.ietf.org/html/rfc3986#section-2.1
792: https://github.com/streamlink/streamlink/pull/4003
80"""
81
82
class Urllib3PercentREOverride:
    """Wrapper around a compiled percent-encoding pattern.

    Its `subn` reports the match count urllib3 expects but returns the
    input string unmodified, so lower-case percent escapes survive
    urllib3's URL normalization.
    """

    def __init__(self, r: re.Pattern):
        self.re = r

    def __getattr__(self, item):
        # Anything we don't override is delegated to the wrapped pattern
        return getattr(self.re, item)

    def subn(self, repl, string, *args, **kwargs):
        # Count matches as the real pattern would, but leave the string as-is
        count = self.re.subn(repl, string, *args, **kwargs)[1]
        return string, count
93
94
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url  # noqa: E305

# Swap urllib3's percent-encoding regex for our no-op-rewrite wrapper
# (attribute name differs between urllib3 1.x and 2.x)
if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
105
106"""
Workaround for issue in urllib3.util.ssl_.py: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
109however this is an issue because we set check_hostname to True in our SSLContext.
110
111Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_context to pass server_hostname regardless.
112
113This has been fixed in urllib3 2.0+.
114See: https://github.com/urllib3/urllib3/issues/517
115"""
116
if urllib3_version < (2, 0, 0):
    # Best-effort patch: the attribute may not exist in every urllib3 build.
    # NOTE: a bare `contextlib.suppress()` suppresses nothing, so any
    # AttributeError would have propagated — pass Exception explicitly.
    with contextlib.suppress(Exception):
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
120
121
# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# Patch in our own select_proxy (from ._helper) instead.
# 1. https://github.com/psf/requests/issues/5000
requests.adapters.select_proxy = select_proxy
126
127
class RequestsResponseAdapter(Response):
    """Adapt a `requests` response to the networking framework's Response interface."""

    def __init__(self, res: requests.models.Response):
        # fp is the raw urllib3 response so read() can control decoding itself
        super().__init__(
            fp=res.raw, headers=res.headers, url=res.url,
            status=res.status_code, reason=res.reason)

        self._requests_response = res

    def read(self, amt: int = None):
        """Read up to `amt` bytes, mapping urllib3 errors to our exceptions.

        Raises SSLError, IncompleteRead or TransportError on failure.
        """
        try:
            # Interact with urllib3 response directly.
            return self.fp.read(amt, decode_content=True)

        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
        # NOTE: handler order matters — SSLError and ProtocolError are both
        # subclasses of urllib3.exceptions.HTTPError, which is the catch-all below.
        except urllib3.exceptions.SSLError as e:
            raise SSLError(cause=e) from e

        except urllib3.exceptions.ProtocolError as e:
            # IncompleteRead is always contained within ProtocolError
            # See urllib3.response.HTTPResponse._error_catcher()
            ir_err = next(
                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
                 if isinstance(err, http.client.IncompleteRead)), None)
            if ir_err is not None:
                # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
                # but uses an `int` for its `partial` property.
                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # catch-all for any other urllib3 response exceptions
            raise TransportError(cause=e) from e
161
162
class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
    """HTTPAdapter that threads our SSLContext and source address into the
    urllib3 pool managers it creates."""

    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
        pool_args = {}
        if ssl_context:
            pool_args['ssl_context'] = ssl_context
        if source_address:
            pool_args['source_address'] = (source_address, 0)
        self._pm_args = pool_args
        # Fall back to the plain SSL context for proxies when none is given
        self._proxy_ssl_context = proxy_ssl_context or ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        merged = {**kwargs, **self._pm_args}
        return super().init_poolmanager(*args, **merged)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        extra_kwargs = {}
        # SOCKS proxies are handled by SocksProxyManager, which takes no TLS context
        if self._proxy_ssl_context and not proxy.lower().startswith('socks'):
            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)

    def cert_verify(*args, **kwargs):
        # lean on SSLContext for cert verification; requests' own logic is bypassed
        pass
185
186
class RequestsSession(requests.sessions.Session):
    """
    Session subclass that keeps redirect-method handling consistent with
    our urllib redirect handler.
    """

    def rebuild_method(self, prepared_request, response):
        method = get_redirect_method(prepared_request.method, response.status_code)

        # HACK: requests strips headers/body on redirect unless the code was 307/308.
        # When keeping the method, masquerade as a 308 and stash the real status.
        if method == prepared_request.method:
            response._real_status_code = response.status_code
            response.status_code = 308

        prepared_request.method = method

    def rebuild_auth(self, prepared_request, response):
        # HACK: restore the status code rebuild_method overwrote, if any.
        # rebuild_auth runs after requests has decided whether to strip headers/body.
        real_status = getattr(response, '_real_status_code', None)
        if real_status is not None:
            response.status_code = real_status
            del response._real_status_code
        return super().rebuild_auth(prepared_request, response)
208
209
class Urllib3LoggingFilter(logging.Filter):
    """Drop urllib3's per-request log line (HTTPConnection prints it already)."""

    def filter(self, record):
        # Keep everything except the noisy request-summary message
        return record.msg != '%s://%s:%s "%s %s %s" %s %s'
217
218
class Urllib3LoggingHandler(logging.Handler):
    """Redirect urllib3 log records to our logger."""

    def __init__(self, logger, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._logger = logger

    def emit(self, record):
        try:
            message = self.format(record)
            # ERROR and above go to the error channel, everything else to stdout
            write = self._logger.error if record.levelno >= logging.ERROR else self._logger.stdout
            write(message)
        except Exception:
            self.handleError(record)
235
236
@register_rh
class RequestsRH(RequestHandler, InstanceStoreMixin):

    """Requests RequestHandler
    https://github.com/psf/requests
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'requests'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Forward urllib3 debug messages to our logger
        logger = logging.getLogger('urllib3')
        handler = Urllib3LoggingHandler(logger=self._logger)
        handler.setFormatter(logging.Formatter('requests: %(message)s'))
        handler.addFilter(Urllib3LoggingFilter())
        logger.addHandler(handler)
        # TODO: Use a logger filter to suppress pool reuse warning instead
        logger.setLevel(logging.ERROR)

        if self.verbose:
            # Setting this globally is not ideal, but is easier than hacking with urllib3.
            # It could technically be problematic for scripts embedding yt-dlp.
            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
            urllib3.connection.HTTPConnection.debuglevel = 1
            logger.setLevel(logging.DEBUG)
        # this is expected if we are using --no-check-certificate
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def close(self):
        # Drop cached sessions (and the pooled connections they hold)
        self._clear_instances()

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # pop the extensions we support so the base class sees nothing unknown
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, cookiejar):
        """Build a requests Session wired to our SSL context, adapter and cookiejar."""
        session = RequestsSession()
        http_adapter = RequestsHTTPAdapter(
            ssl_context=self._make_sslcontext(),
            source_address=self.source_address,
            max_retries=urllib3.util.retry.Retry(False),
        )
        # Replace the default adapters with our single configured adapter
        session.adapters.clear()
        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
        session.mount('https://', http_adapter)
        session.mount('http://', http_adapter)
        session.cookies = cookiejar
        session.trust_env = False  # no need, we already load proxies from env
        return session

    def _send(self, request):
        """Send `request`, mapping requests/urllib3 errors to our exceptions.

        Returns a RequestsResponseAdapter; raises HTTPError for non-2xx statuses.
        """
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)

        max_redirects_exceeded = False

        session = self._get_instance(
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar)

        try:
            requests_res = session.request(
                method=request.method,
                url=request.url,
                data=request.data,
                headers=headers,
                timeout=float(request.extensions.get('timeout') or self.timeout),
                proxies=request.proxies or self.proxies,
                allow_redirects=True,
                stream=True
            )

        except requests.exceptions.TooManyRedirects as e:
            # Still surface the last response, flagged as a redirect loop below
            max_redirects_exceeded = True
            requests_res = e.response

        except requests.exceptions.SSLError as e:
            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
                raise CertificateVerifyError(cause=e) from e
            raise SSLError(cause=e) from e

        except requests.exceptions.ProxyError as e:
            raise ProxyError(cause=e) from e

        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # Catch any urllib3 exceptions that may leak through
            raise TransportError(cause=e) from e

        except requests.exceptions.RequestException as e:
            # Miscellaneous Requests exceptions. May not necessarily be network related e.g. InvalidURL
            raise RequestError(cause=e) from e

        res = RequestsResponseAdapter(requests_res)

        if not 200 <= res.status < 300:
            raise HTTPError(res, redirect_loop=max_redirects_exceeded)

        return res
344
345
@register_preference(RequestsRH)
def requests_preference(rh, request):
    # Positive preference: pick this handler over lower-ranked ones when available
    return 100
349
350
351# Use our socks proxy implementation with requests to avoid an extra dependency.
class SocksHTTPConnection(urllib3.connection.HTTPConnection):
    """HTTPConnection whose socket is established through our SOCKS proxy
    implementation, translating failures into urllib3 exception types."""

    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
        self._proxy_args = _socks_options
        super().__init__(*args, **kwargs)

    def _new_conn(self):
        proxy = self._proxy_args
        make_socket = functools.partial(
            create_socks_proxy_socket, (self.host, self.port), proxy)
        try:
            return create_connection(
                address=(proxy['addr'], proxy['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=make_socket)
        except (socket.timeout, TimeoutError) as err:
            raise urllib3.exceptions.ConnectTimeoutError(
                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from err
        except SocksProxyError as err:
            raise urllib3.exceptions.ProxyError(str(err), err) from err
        except (OSError, socket.error) as err:
            raise urllib3.exceptions.NewConnectionError(
                self, f'Failed to establish a new connection: {err}') from err
373
374
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    # HTTPS variant: SocksHTTPConnection supplies _new_conn (the SOCKS socket),
    # urllib3's HTTPSConnection handles the TLS wrapping on top of it.
    pass
377
378
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    # Pool whose connections are created through the SOCKS proxy
    ConnectionCls = SocksHTTPConnection
381
382
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    # HTTPS pool whose connections are created through the SOCKS proxy
    ConnectionCls = SocksHTTPSConnection
385
386
class SocksProxyManager(urllib3.PoolManager):
    """PoolManager that routes all pools through our SOCKS connection classes."""

    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
        # Parsed SOCKS options travel to the connections via the pool kwargs
        pool_kw = dict(connection_pool_kw, _socks_options=make_socks_proxy_opts(socks_proxy))
        super().__init__(num_pools, headers, **pool_kw)
        self.pool_classes_by_scheme = {
            'http': SocksHTTPConnectionPool,
            'https': SocksHTTPSConnectionPool,
        }
396
397
# Use our SOCKS implementation in place of requests' (avoids the PySocks dependency)
requests.adapters.SOCKSProxyManager = SocksProxyManager