jfr.im git - yt-dlp.git/blob - yt_dlp/networking/common.py
458eca39f8265d5dcc4d8cb1f8cb1c1d652749b0
[yt-dlp.git] / yt_dlp / networking / common.py
1 from __future__ import annotations
2
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
15 from http.cookiejar import CookieJar
16
17 from ._helper import make_ssl_context, wrap_request_errors
18 from .exceptions import (
19 NoSupportingHandlers,
20 RequestError,
21 TransportError,
22 UnsupportedRequest,
23 )
24 from ..utils import (
25 bug_reports_message,
26 classproperty,
27 deprecation_warning,
28 error_to_str,
29 escape_url,
30 update_url_query,
31 )
32 from ..utils.networking import HTTPHeaderDict
33
34 if typing.TYPE_CHECKING:
35 RequestData = bytes | Iterable[bytes] | typing.IO | None
36
37
class RequestDirector:
    """Dispatches requests to request handlers.

    Given a Request, the director looks for a registered RequestHandler
    that accepts it and forwards the request to that handler.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose
        # Maps RH_KEY -> handler instance; insertion order decides priority in send()
        self.handlers: dict[str, RequestHandler] = {}

    def close(self):
        """Close every handler currently held by the director."""
        for rh in self.handlers.values():
            rh.close()

    def add_handler(self, handler: RequestHandler):
        """Store a handler under its RH_KEY, replacing any handler already stored under that key."""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _print_verbose(self, msg):
        """Write a director debug message through the logger's stdout, only when verbose is enabled."""
        if not self.verbose:
            return
        self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Hand the request to the first handler that validates it and return its Response.

        Handlers are tried in reverse insertion order. A handler that raises
        UnsupportedRequest during validation is skipped. A RequestError raised
        while sending is propagated unchanged; any other exception is logged,
        recorded and the next handler is tried.

        Raises RequestError if no handlers are configured, and
        NoSupportingHandlers if no handler produced a response.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unsupported_errors, unexpected_errors = [], []
        # TODO (future): add a per-request preference system
        for rh in list(self.handlers.values())[::-1]:
            self._print_verbose(f'Checking if "{rh.RH_NAME}" supports this request.')
            try:
                rh.validate(request)
            except UnsupportedRequest as err:
                self._print_verbose(
                    f'"{rh.RH_NAME}" cannot handle this request (reason: {error_to_str(err)})')
                unsupported_errors.append(err)
                continue

            self._print_verbose(f'Sending request via "{rh.RH_NAME}"')
            try:
                response = rh.send(request)
            except RequestError:
                # Expected failure type: let the caller deal with it
                raise
            except Exception as err:
                # Anything else is treated as a problem with the handler itself
                message = f'[{rh.RH_NAME}] Unexpected error: {error_to_str(err)}{bug_reports_message()}'
                self.logger.error(message, is_error=False)
                unexpected_errors.append(err)
            else:
                assert isinstance(response, Response)
                return response

        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
103
104
# Registry of RequestHandler classes added through register(), keyed by RH_KEY
_REQUEST_HANDLERS = {}


def register(handler):
    """Add a RequestHandler class to the registry and return it unchanged.

    Returning the class lets this be used as a class decorator.
    The class must subclass RequestHandler and its RH_KEY must not be registered yet.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    key = handler.RH_KEY
    assert key not in _REQUEST_HANDLERS, f'RequestHandler {key} already registered'
    _REQUEST_HANDLERS[key] = handler
    return handler
114
115
class Features(enum.Enum):
    """Optional proxy features that a RequestHandler can declare in `_SUPPORTED_FEATURES`."""

    # The `all` key of a proxies dict is supported
    ALL_PROXY = enum.auto()
    # The `no` key of a proxies dict is supported
    NO_PROXY = enum.auto()
119
120
class RequestHandler(abc.ABC):

    """Request Handler class

    A request handler is a class that takes a Request, processes it
    from start to finish and returns a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which handles the underlying request logic and returns a Response.

    The RH_NAME class variable may contain a display name for the RequestHandler.
    By default, it is generated from the class name.

    The class name of a concrete request handler MUST end with "RH".

    Every exception raised by a RequestHandler should be an instance of RequestError.
    Any other exception raised will be treated as a handler issue.

    If the handler does not support a Request, an UnsupportedRequest
    should be raised with a reason.

    By default, _validate() checks the request against the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in the Features enum.
    Any of the above may be set to None to disable the corresponding check.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may also be available on individual Requests. In that case,
    either the Request configuration option takes precedence or the two are merged.

    Requests may carry additional optional parameters defined as extensions.
    RequestHandler subclasses may choose to support custom extensions.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request
    - `timeout`: socket timeout to use for this request

    Apart from the url protocol, the proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: CookieJar = None,
        timeout: float | int | None = None,
        proxies: dict = None,
        source_address: str = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):
        """Store the handler configuration; unknown keyword arguments are accepted and ignored."""
        self._logger = logger
        self.verbose = verbose

        # Request defaults
        self.headers = headers or {}
        self.cookiejar = CookieJar() if cookiejar is None else cookiejar
        self.timeout = float(timeout or 20)  # falsy timeout falls back to 20 seconds
        self.proxies = proxies or {}
        self.source_address = source_address

        # SSL configuration
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        super().__init__()

    def _make_sslcontext(self):
        """Build an SSL context from the handler's SSL configuration via make_ssl_context()."""
        prefer_certifi = not self.prefer_system_certs
        return make_ssl_context(
            verify=self.verify,
            legacy_support=self.legacy_ssl_support,
            use_certifi=prefer_certifi,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Return an HTTPHeaderDict built from the handler headers followed by the request headers."""
        return HTTPHeaderDict(self.headers, request_headers)

    def _check_url_scheme(self, request: Request):
        """Return the lowercased url scheme of the request, raising UnsupportedRequest if it is not supported."""
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        supported = self._SUPPORTED_URL_SCHEMES
        if supported is None or scheme in supported:
            return scheme  # for further processing
        raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')

    def _check_proxies(self, proxies):
        """Raise UnsupportedRequest if the proxies dict uses a feature or proxy scheme this handler lacks."""
        features = self._SUPPORTED_FEATURES
        url_schemes = self._SUPPORTED_URL_SCHEMES
        proxy_schemes = self._SUPPORTED_PROXY_SCHEMES

        def feature_missing(feature):
            # A None feature list disables the feature checks
            return features is not None and feature not in features

        for key, url in proxies.items():
            if url is None:
                continue

            if key == 'no':
                if feature_missing(Features.NO_PROXY):
                    raise UnsupportedRequest('"no" proxy is not supported')
                continue

            if key == 'all' and feature_missing(Features.ALL_PROXY):
                raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This allows a proxy to be set for a protocol of one handler
            # even when that protocol (and proxy) is not supported by another handler.
            if url_schemes is not None and key not in (*url_schemes, 'all'):
                continue

            if proxy_schemes is None:
                # Skip proxy scheme checks
                continue

            # Scheme-less proxies are not supported
            if urllib.request._parse_proxy(url)[0] is None:
                raise UnsupportedRequest(f'Proxy "{url}" missing scheme')

            proxy_scheme = urllib.parse.urlparse(url).scheme.lower()
            if proxy_scheme not in proxy_schemes:
                raise UnsupportedRequest(f'Unsupported proxy type: "{proxy_scheme}"')

    def _check_cookiejar_extension(self, extensions):
        """Raise UnsupportedRequest if a truthy `cookiejar` extension is not a CookieJar."""
        jar = extensions.get('cookiejar')
        if jar and not isinstance(jar, CookieJar):
            raise UnsupportedRequest('cookiejar is not a CookieJar')

    def _check_timeout_extension(self, extensions):
        """Raise UnsupportedRequest if a non-None `timeout` extension is not a float or int."""
        timeout = extensions.get('timeout')
        if timeout is not None and not isinstance(timeout, (float, int)):
            raise UnsupportedRequest('timeout is not a float or int')

    def _check_extensions(self, extensions):
        """Run the checks for the extensions defined for RequestHandler."""
        self._check_cookiejar_extension(extensions)
        self._check_timeout_extension(extensions)

    def _validate(self, request):
        """Check url scheme, proxies (request proxies, else handler proxies) and extensions."""
        self._check_url_scheme(request)
        self._check_proxies(request.proxies or self.proxies)
        self._check_extensions(request.extensions)

    @wrap_request_errors
    def validate(self, request: Request):
        """Validate the request for this handler; raises TypeError if it is not a Request."""
        if isinstance(request, Request):
            self._validate(request)
            return
        raise TypeError('Expected an instance of Request')

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send the request through _send(); raises TypeError if it is not a Request."""
        if isinstance(request, Request):
            return self._send(request)
        raise TypeError('Expected an instance of Request')

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""

    def close(self):
        """No-op by default."""

    @classproperty
    def RH_NAME(cls):
        """Display name: the class name with its last two characters removed."""
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        """Handler key: the class name without its mandatory "RH" suffix."""
        name = cls.__name__
        assert name.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return name[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Leaving the context closes the handler
        self.close()
320
321
class Request:
    """
    A request that is to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
        self,
        url: str,
        data: RequestData = None,
        headers: typing.Mapping = None,
        proxies: dict = None,
        query: dict = None,
        method: str = None,
        extensions: dict = None
    ):
        # Backing fields must exist before the property setters below run
        self._headers = HTTPHeaderDict()
        self._data = None

        self.url = update_url_query(url, query) if query else url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        if url.startswith('//'):
            # Protocol-relative url: default to http
            url = f'http:{url}'
        self._url = escape_url(url)

    @property
    def method(self):
        # An explicit method wins; otherwise infer it from the presence of a payload
        if self._method:
            return self._method
        return 'GET' if self.data is None else 'POST'

    @method.setter
    def method(self, method):
        if method is not None:
            if not isinstance(method, str):
                raise TypeError('method must be a string')
            method = method.upper()
        self._method = method

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        # Try catch some common mistakes
        if data is not None:
            acceptable = (
                isinstance(data, (bytes, io.IOBase, Iterable))
                and not isinstance(data, (str, Mapping)))
            if not acceptable:
                raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        if data == self._data and self._data is None:
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                # The old length no longer applies to the new payload
                self.headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            self.headers.pop('Content-Type', None)
        elif 'Content-Type' not in self.headers:
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. Any mapping other than a HTTPHeaderDict is converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            self._headers = new_headers
            return
        if not isinstance(new_headers, Mapping):
            raise TypeError('headers must be a mapping')
        self._headers = HTTPHeaderDict(new_headers)

    def update(self, url=None, data=None, headers=None, query=None):
        """Update the request in place; falsy arguments leave the respective part as it is."""
        self.data = data or self.data
        self.headers.update(headers or {})
        target_url = url or self.url
        self.url = update_url_query(target_url, query or {})

    def copy(self):
        """Return a new request of the same class.

        Headers and proxies are deep-copied, extensions are shallow-copied,
        and the payload object is passed through as-is.
        """
        cls = self.__class__
        return cls(
            url=self.url,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            data=self._data,
            extensions=copy.copy(self.extensions),
            method=self._method,
        )
441
442
# Request constructors with the HTTP method preset through functools.partial
HEADRequest = functools.partial(Request, method='HEAD')  # preset method: HEAD
PUTRequest = functools.partial(Request, method='PUT')  # preset method: PUT
445
446
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    Out of the box it is a thin wrapper around a file-like response object.

    The interface is partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    """

    def __init__(
            self,
            fp: typing.IO,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str = None):

        self.fp = fp

        # Headers are kept in an email.message.Message
        self.headers = Message()
        for key, val in headers.items():
            self.headers.add_header(key, val)

        self.status = status
        self.url = url

        if not reason:
            # Fall back to the standard phrase; status codes HTTPStatus rejects get no reason
            try:
                reason = HTTPStatus(status).phrase
            except ValueError:
                reason = None
        self.reason = reason

    def readable(self):
        """Report whether the wrapped file object is readable."""
        return self.fp.readable()

    def read(self, amt: int = None) -> bytes:
        """Read from the wrapped file object, re-raising any exception as TransportError."""
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            return self.fp.read(amt)
        except Exception as err:
            raise TransportError(cause=err) from err

    def close(self):
        """Close the wrapped file object, then this object."""
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        values = self.headers.get_all(name)
        if not values:
            return default
        # Special case Set-Cookie: only get the first one
        # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
        return values[0] if name.title() == 'Set-Cookie' else ', '.join(values)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        """Deprecated alias of `status`."""
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        """Deprecated; returns `status`."""
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        """Deprecated; returns `url`."""
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        """Deprecated; returns `headers`."""
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        """Deprecated; forwards to `get_header`."""
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)