# yt_dlp/networking/_requests.py -- requests-based RequestHandler implementation
1 import contextlib
2 import functools
3 import http.client
4 import logging
5 import re
6 import socket
7 import warnings
8
9 from ..dependencies import brotli, requests, urllib3
10 from ..utils import bug_reports_message, int_or_none, variadic
11 from ..utils.networking import normalize_url
12
# Hard dependency checks: this handler is useless without requests/urllib3,
# so fail at import time with a clear message instead of later at runtime.
if requests is None:
    raise ImportError('requests module is not installed')

if urllib3 is None:
    raise ImportError('urllib3 module is not installed')

# Parse "x.y.z" into a comparable tuple; non-numeric components become 0
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))

if urllib3_version < (1, 26, 17):
    raise ImportError('Only urllib3 >= 1.26.17 is supported')

# requests.__build__ is a hex-encoded version number (0xMMmmpp)
if requests.__build__ < 0x023100:
    raise ImportError('Only requests >= 2.31.0 is supported')
26
27 import requests.adapters
28 import requests.utils
29 import urllib3.connection
30 import urllib3.exceptions
31 import urllib3.util
32
33 from ._helper import (
34 InstanceStoreMixin,
35 add_accept_encoding_header,
36 create_connection,
37 create_socks_proxy_socket,
38 get_redirect_method,
39 make_socks_proxy_opts,
40 select_proxy,
41 )
42 from .common import (
43 Features,
44 RequestHandler,
45 Response,
46 register_preference,
47 register_rh,
48 )
49 from .exceptions import (
50 CertificateVerifyError,
51 HTTPError,
52 IncompleteRead,
53 ProxyError,
54 RequestError,
55 SSLError,
56 TransportError,
57 )
58 from ..socks import ProxyError as SocksProxyError
59
# Content-Encodings we advertise in Accept-Encoding; urllib3 decodes these
# transparently on read (see decode_content=True in RequestsResponseAdapter)
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]

# Brotli support is optional (provided by the brotli/brotlicffi dependency)
if brotli is not None:
    SUPPORTED_ENCODINGS.append('br')
66
67 """
68 Override urllib3's behavior to not convert lower-case percent-encoded characters
69 to upper-case during url normalization process.
70
RFC3986 defines that the lower or upper case percent-encoded hexadecimal characters are equivalent
72 and normalizers should convert them to uppercase for consistency [1].
73
74 However, some sites may have an incorrect implementation where they provide
75 a percent-encoded url that is then compared case-sensitively.[2]
76
77 While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.
79
80 1: https://tools.ietf.org/html/rfc3986#section-2.1
81 2: https://github.com/streamlink/streamlink/pull/4003
82 """
83
84
class Urllib3PercentREOverride:
    """Wrap a compiled percent-encoding regex so that substitution is a no-op.

    urllib3 calls this pattern's ``subn`` during URL normalization to
    upper-case percent-encoded sequences; returning the input unchanged
    (while still reporting the real match count) disables that case
    normalization without affecting any other use of the pattern.
    """

    def __init__(self, r: re.Pattern):
        self.re = r

    def __getattr__(self, item):
        # Delegate every other attribute (match, sub, pattern, ...) to the wrapped regex
        return getattr(self.re, item)

    def subn(self, repl, string, *args, **kwargs):
        # Report how many replacements *would* have happened, but hand back
        # the string untouched so urllib3 keeps the original casing
        _, count = self.re.subn(repl, string, *args, **kwargs)
        return string, count
95
96
# urllib3 >= 1.25.8 uses subn:
# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
import urllib3.util.url  # noqa: E305

# Install the no-op override on whichever attribute name this urllib3
# version uses; warn (with a bug-report hint) if neither exists, since
# that would silently re-enable the case normalization described above.
if hasattr(urllib3.util.url, 'PERCENT_RE'):
    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
else:
    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
107
108 """
Workaround for issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
110 server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
111 however this is an issue because we set check_hostname to True in our SSLContext.
112
113 Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_context to pass server_hostname regardless.
114
115 This has been fixed in urllib3 2.0+.
116 See: https://github.com/urllib3/urllib3/issues/517
117 """
118
# Pre-2.0 urllib3 only forwards server_hostname for non-IP hosts unless it
# believes SecureTransport is in use; forcing the flag makes it forward
# server_hostname unconditionally, which our check_hostname=True context needs.
# suppress(Exception): best-effort — the attribute layout varies across versions.
if urllib3_version < (2, 0, 0):
    with contextlib.suppress(Exception):
        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
122
123
# Requests will not automatically handle no_proxy by default
# due to buggy no_proxy handling with proxy dict [1].
# 1. https://github.com/psf/requests/issues/5000
# Replace it globally with our own implementation (from ._helper).
requests.adapters.select_proxy = select_proxy
128
129
class RequestsResponseAdapter(Response):
    """Adapt a ``requests.models.Response`` to the yt-dlp ``Response``
    interface, mapping urllib3 read-time errors onto yt-dlp networking
    exceptions."""

    def __init__(self, res: requests.models.Response):
        # res.raw is the underlying urllib3 response object; it becomes
        # the file-like `fp` that read() below operates on
        super().__init__(
            fp=res.raw, headers=res.headers, url=res.url,
            status=res.status_code, reason=res.reason)

        # Keep the requests response alive alongside its raw urllib3 body
        self._requests_response = res

    def read(self, amt: int = None):
        """Read up to ``amt`` bytes (all remaining if ``None``), transparently
        decoding any Content-Encoding.

        Raises:
            SSLError: on TLS failures during the read
            IncompleteRead: when the body ends before the declared length
            TransportError: on any other urllib3 transport failure
        """
        try:
            # Interact with urllib3 response directly.
            return self.fp.read(amt, decode_content=True)

        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
        except urllib3.exceptions.SSLError as e:
            raise SSLError(cause=e) from e

        except urllib3.exceptions.ProtocolError as e:
            # IncompleteRead is always contained within ProtocolError
            # See urllib3.response.HTTPResponse._error_catcher()
            ir_err = next(
                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
                 if isinstance(err, http.client.IncompleteRead)), None)
            if ir_err is not None:
                # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
                # but uses an `int` for its `partial` property.
                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # catch-all for any other urllib3 response exceptions
            raise TransportError(cause=e) from e
163
164
class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
    """HTTPAdapter that injects our SSLContext / source address into every
    urllib3 pool (and proxy pool) it creates, bypassing requests' own
    certificate handling entirely."""

    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
        pm_args = {}
        if ssl_context:
            pm_args['ssl_context'] = ssl_context
        if source_address:
            pm_args['source_address'] = (source_address, 0)
        self._pm_args = pm_args
        # Fall back to the main SSL context for HTTPS proxies when no
        # dedicated proxy context was supplied
        self._proxy_ssl_context = proxy_ssl_context or ssl_context
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        return super().init_poolmanager(*args, **kwargs, **self._pm_args)

    def proxy_manager_for(self, proxy, **proxy_kwargs):
        extra_kwargs = {}
        # SOCKS pools take no proxy_ssl_context; only pass it for HTTP(S) proxies
        if self._proxy_ssl_context and not proxy.lower().startswith('socks'):
            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)

    # Skip `requests` internal verification; we use our own SSLContext
    # requests 2.31.0+
    def cert_verify(*args, **kwargs):
        pass

    # requests 2.31.0-2.32.1
    def _get_connection(self, request, *_, proxies=None, **__):
        return self.get_connection(request.url, proxies)

    # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        url = urllib3.util.parse_url(request.url).url
        proxy = select_proxy(url, proxies)
        if proxy:
            return self.proxy_manager_for(proxy).connection_from_url(url)
        return self.poolmanager.connection_from_url(url)
202
203
class RequestsSession(requests.sessions.Session):
    """
    Ensure unified redirect method handling with our urllib redirect handler.
    """

    def rebuild_method(self, prepared_request, response):
        new_method = get_redirect_method(prepared_request.method, response.status_code)

        # HACK: requests removes headers/body on redirect unless code was a 307/308.
        # When our redirect policy keeps the method, masquerade as 308 so
        # requests preserves them; rebuild_auth undoes this afterwards.
        if prepared_request.method == new_method:
            response._real_status_code = response.status_code
            response.status_code = 308

        prepared_request.method = new_method

        # Requests fails to resolve dot segments on absolute redirect locations
        # See: https://github.com/yt-dlp/yt-dlp/issues/9020
        prepared_request.url = normalize_url(prepared_request.url)

    def rebuild_auth(self, prepared_request, response):
        # HACK: undo status code change from rebuild_method, if applicable.
        # rebuild_auth runs after requests would remove headers/body based on status code
        try:
            response.status_code = response._real_status_code
        except AttributeError:
            pass
        else:
            del response._real_status_code
        return super().rebuild_auth(prepared_request, response)
230
231
class Urllib3LoggingFilter(logging.Filter):
    """Drop urllib3's per-request debug line; HTTPConnection already prints
    the request when its debuglevel is enabled."""

    def filter(self, record):
        # Compare against the raw (unformatted) message template urllib3
        # uses for its connection log line
        return record.msg != '%s://%s:%s "%s %s %s" %s %s'
239
240
class Urllib3LoggingHandler(logging.Handler):
    """Redirect urllib3 logs to our logger"""

    def __init__(self, logger, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._logger = logger

    def emit(self, record):
        # Mirror logging.Handler conventions: never let a logging failure
        # propagate — route it through handleError instead
        try:
            message = self.format(record)
        except Exception:
            self.handleError(record)
            return
        try:
            if record.levelno < logging.ERROR:
                self._logger.stdout(message)
            else:
                self._logger.error(message)
        except Exception:
            self.handleError(record)
258
259
@register_rh
class RequestsRH(RequestHandler, InstanceStoreMixin):

    """Requests RequestHandler
    https://github.com/psf/requests
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'requests'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Forward urllib3 debug messages to our logger
        logger = logging.getLogger('urllib3')
        self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
        self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
        self.__logging_handler.addFilter(Urllib3LoggingFilter())
        logger.addHandler(self.__logging_handler)
        # TODO: Use a logger filter to suppress pool reuse warning instead
        logger.setLevel(logging.ERROR)

        if self.verbose:
            # Setting this globally is not ideal, but is easier than hacking with urllib3.
            # It could technically be problematic for scripts embedding yt-dlp.
            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
            urllib3.connection.HTTPConnection.debuglevel = 1
            logger.setLevel(logging.DEBUG)
        # this is expected if we are using --no-check-certificate
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def close(self):
        self._clear_instances()
        # Remove the logging handler that contains a reference to our logger
        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
        logging.getLogger('urllib3').removeHandler(self.__logging_handler)

    def _check_extensions(self, extensions):
        # Pop the extensions this handler handles itself; presumably the
        # base class treats anything left over as unsupported — confirm
        # against RequestHandler._check_extensions
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, cookiejar):
        # Build a Session wired with our SSL context, source address, cookie
        # jar and redirect handling; InstanceStoreMixin caches it per cookiejar
        session = RequestsSession()
        http_adapter = RequestsHTTPAdapter(
            ssl_context=self._make_sslcontext(),
            source_address=self.source_address,
            max_retries=urllib3.util.retry.Retry(False),  # disable urllib3's automatic retries
        )
        # Drop requests' default adapters/headers; mount ours for both schemes
        session.adapters.clear()
        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
        session.mount('https://', http_adapter)
        session.mount('http://', http_adapter)
        session.cookies = cookiejar
        session.trust_env = False  # no need, we already load proxies from env
        return session

    def _send(self, request):
        """Send ``request`` via a (cached) requests Session and return a
        RequestsResponseAdapter.

        Maps requests/urllib3 exceptions onto yt-dlp networking errors and
        raises HTTPError for non-2xx responses (with redirect_loop set when
        the redirect limit was hit).
        """
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)

        max_redirects_exceeded = False

        session = self._get_instance(cookiejar=self._get_cookiejar(request))

        try:
            requests_res = session.request(
                method=request.method,
                url=request.url,
                data=request.data,
                headers=headers,
                timeout=self._calculate_timeout(request),
                proxies=self._get_proxies(request),
                allow_redirects=True,
                stream=True  # defer body download to RequestsResponseAdapter.read()
            )

        except requests.exceptions.TooManyRedirects as e:
            # Still surface the final response; flagged via redirect_loop below
            max_redirects_exceeded = True
            requests_res = e.response

        except requests.exceptions.SSLError as e:
            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
                raise CertificateVerifyError(cause=e) from e
            raise SSLError(cause=e) from e

        except requests.exceptions.ProxyError as e:
            raise ProxyError(cause=e) from e

        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            raise TransportError(cause=e) from e

        except urllib3.exceptions.HTTPError as e:
            # Catch any urllib3 exceptions that may leak through
            raise TransportError(cause=e) from e

        except requests.exceptions.RequestException as e:
            # Miscellaneous Requests exceptions. May not necessarily be network related e.g. InvalidURL
            raise RequestError(cause=e) from e

        res = RequestsResponseAdapter(requests_res)

        if not 200 <= res.status < 300:
            raise HTTPError(res, redirect_loop=max_redirects_exceeded)

        return res
369
370
@register_preference(RequestsRH)
def requests_preference(rh, request):
    # NOTE(review): positive score presumably ranks RequestsRH above
    # lower-scored handlers — confirm semantics in .common.register_preference
    return 100
374
375
# Use our socks proxy implementation with requests to avoid an extra dependency.
class SocksHTTPConnection(urllib3.connection.HTTPConnection):
    """HTTPConnection that tunnels through a SOCKS proxy using yt-dlp's
    own SOCKS implementation (see ..socks) instead of PySocks."""

    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
        self._proxy_args = _socks_options
        super().__init__(*args, **kwargs)

    def _new_conn(self):
        # Connect to the proxy address, then perform the SOCKS handshake
        # targeting (self.host, self.port); failures are mapped onto the
        # urllib3 exception types its pool/retry machinery expects.
        try:
            return create_connection(
                address=(self._proxy_args['addr'], self._proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
        # socket.timeout is an alias of TimeoutError on modern Python; keep both for compatibility
        except (socket.timeout, TimeoutError) as e:
            raise urllib3.exceptions.ConnectTimeoutError(
                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
        except SocksProxyError as e:
            raise urllib3.exceptions.ProxyError(str(e), e) from e
        # OSError must come last: TimeoutError is an OSError subclass
        except OSError as e:
            raise urllib3.exceptions.NewConnectionError(
                self, f'Failed to establish a new connection: {e}') from e
398
399
class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
    # TLS variant: SOCKS tunnelling from SocksHTTPConnection combined with
    # urllib3's HTTPS handshake behavior via the MRO
    pass
402
403
class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
    # Pool that hands out SOCKS-tunnelled plain-HTTP connections
    ConnectionCls = SocksHTTPConnection
406
407
class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
    # Pool that hands out SOCKS-tunnelled HTTPS connections
    ConnectionCls = SocksHTTPSConnection
410
411
class SocksProxyManager(urllib3.PoolManager):
    """PoolManager whose pools create connections through a SOCKS proxy.

    The parsed proxy options travel to the connection classes via the
    ``_socks_options`` pool key. ``username``/``password`` are accepted for
    signature compatibility with the SOCKSProxyManager interface requests
    expects; credentials presumably come from the proxy URL itself via
    make_socks_proxy_opts — confirm there.
    """

    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
        connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
        super().__init__(num_pools, headers, **connection_pool_kw)
        # Must be assigned after super().__init__, which installs the defaults
        self.pool_classes_by_scheme = dict(
            http=SocksHTTPConnectionPool,
            https=SocksHTTPSConnectionPool,
        )
421
422
# Point requests at our SOCKS implementation instead of requiring PySocks
requests.adapters.SOCKSProxyManager = SocksProxyManager