]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/common.py
[networking] Rewrite architecture (#2861)
[yt-dlp.git] / yt_dlp / networking / common.py
1 from __future__ import annotations
2
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
15 from http.cookiejar import CookieJar
16
17 from ._helper import make_ssl_context, wrap_request_errors
18 from .exceptions import (
19 NoSupportingHandlers,
20 RequestError,
21 TransportError,
22 UnsupportedRequest,
23 )
24 from ..utils import (
25 bug_reports_message,
26 classproperty,
27 error_to_str,
28 escape_url,
29 update_url_query,
30 )
31 from ..utils.networking import HTTPHeaderDict
32
33 if typing.TYPE_CHECKING:
34 RequestData = bytes | Iterable[bytes] | typing.IO | None
35
36
class RequestDirector:
    """Route a Request to a RequestHandler that is able to process it.

    Handlers are kept in a dict keyed by their RH_KEY. When sending, the
    handlers are consulted in the reverse of that dict's iteration order.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        self.handlers: dict[str, RequestHandler] = {}
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose

    def close(self):
        """Call close() on every handler currently held by the director."""
        for rh in self.handlers.values():
            rh.close()

    def add_handler(self, handler: RequestHandler):
        """Store a handler under its RH_KEY, replacing any handler already stored under that key."""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _print_verbose(self, msg):
        """Write a "director: "-prefixed message to the logger's stdout when verbose is set."""
        if not self.verbose:
            return
        self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Hand the request to the first handler that validates it and return its Response.

        A handler that raises UnsupportedRequest during validation is skipped.
        A RequestError raised while sending is propagated as-is; any other
        exception is logged, recorded, and the next handler is tried.

        @raises RequestError: if no handlers are configured.
        @raises NoSupportingHandlers: if every handler was skipped or failed unexpectedly.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unsupported_errors = []
        unexpected_errors = []

        # Snapshot the handlers first so the loop works on a stable list.
        candidates = list(self.handlers.values())
        candidates.reverse()

        # TODO (future): add a per-request preference system
        for rh in candidates:
            rh_name = rh.RH_NAME
            self._print_verbose(f'Checking if "{rh_name}" supports this request.')
            try:
                rh.validate(request)
            except UnsupportedRequest as err:
                self._print_verbose(
                    f'"{rh_name}" cannot handle this request (reason: {error_to_str(err)})')
                unsupported_errors.append(err)
                continue

            self._print_verbose(f'Sending request via "{rh_name}"')
            try:
                response = rh.send(request)
            except RequestError:
                # Errors the handler reports deliberately are final; do not try other handlers.
                raise
            except Exception as err:
                self.logger.error(
                    f'[{rh_name}] Unexpected error: {error_to_str(err)}{bug_reports_message()}',
                    is_error=False)
                unexpected_errors.append(err)
                continue
            else:
                assert isinstance(response, Response)
                return response

        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
102
103
# RequestHandler classes added through register(), keyed by their RH_KEY.
_REQUEST_HANDLERS = {}


def register(handler):
    """Record a RequestHandler class in _REQUEST_HANDLERS under its RH_KEY.

    The class is returned unchanged, so this can be applied as a class decorator.
    Registering a non-RequestHandler class, or a key that is already present,
    trips an assertion.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    rh_key = handler.RH_KEY
    assert rh_key not in _REQUEST_HANDLERS, f'RequestHandler {rh_key} already registered'
    _REQUEST_HANDLERS[rh_key] = handler
    return handler
113
114
class Features(enum.Enum):
    """Optional proxy capabilities a RequestHandler may list in `_SUPPORTED_FEATURES`."""

    # Handler accepts the `all` key of a proxies dict.
    ALL_PROXY = enum.auto()
    # Handler accepts the `no` key of a proxies dict.
    NO_PROXY = enum.auto()
118
119
class RequestHandler(abc.ABC):

    """Request Handler class

    Request handlers are classes that, given a Request,
    process the request from start to finish and return a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which handles the underlying request logic and returns a Response.

    RH_NAME class variable may contain a display name for the RequestHandler.
    By default, this is generated from the class name.

    The concrete request handler MUST have "RH" as the suffix in the class name.

    All exceptions raised by a RequestHandler should be an instance of RequestError.
    Any other exception raised will be treated as a handler issue.

    If a Request is not supported by the handler, an UnsupportedRequest
    should be raised with a reason.

    By default, some checks are done on the request in _validate() based on the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.
    The above may be set to None to disable the checks.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may be available for individual Requests too. In this case,
    either the Request configuration option takes precedence or they are merged.

    Requests may have additional optional parameters defined as extensions.
    RequestHandler subclasses may choose to support custom extensions.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request
    - `timeout`: socket timeout to use for this request

    Apart from the url protocol, proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: CookieJar = None,
        timeout: float | int | None = None,
        proxies: dict = None,
        source_address: str = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):
        """Store the handler configuration. Unknown keyword arguments are accepted and ignored."""

        self._logger = logger
        self.headers = headers or {}
        # An explicitly passed (possibly empty) cookiejar is kept; only None gets a fresh one.
        self.cookiejar = cookiejar if cookiejar is not None else CookieJar()
        # Any falsy timeout (None or 0) falls back to 20 seconds.
        self.timeout = float(timeout or 20)
        self.proxies = proxies or {}
        self.source_address = source_address
        self.verbose = verbose
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        super().__init__()

    def _make_sslcontext(self):
        """Build an SSL context from the handler's SSL settings using make_ssl_context()."""
        return make_ssl_context(
            verify=self.verify,
            legacy_support=self.legacy_ssl_support,
            use_certifi=not self.prefer_system_certs,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Return a new HTTPHeaderDict built from the handler headers and then the request headers.

        NOTE(review): presumably later mappings override earlier ones, so request
        headers win - confirm against HTTPHeaderDict.
        """
        return HTTPHeaderDict(self.headers, request_headers)

    def _check_url_scheme(self, request: Request):
        """Return the lower-cased url scheme of the request.

        @raises UnsupportedRequest: if `_SUPPORTED_URL_SCHEMES` is not None and does not contain the scheme.
        """
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
            raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
        return scheme  # for further processing

    def _check_proxies(self, proxies):
        """Check a proxies dict against the features and proxy schemes this handler supports.

        Entries whose value is None are ignored.

        @raises UnsupportedRequest: for an unsupported `no`/`all` key, a proxy url
            without a scheme, or a proxy scheme not in `_SUPPORTED_PROXY_SCHEMES`.
        """
        for proxy_key, proxy_url in proxies.items():
            if proxy_url is None:
                continue
            if proxy_key == 'no':
                if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
                    raise UnsupportedRequest('"no" proxy is not supported')
                # The `no` value is a host list, not a proxy url; nothing more to check.
                continue
            if (
                proxy_key == 'all'
                and self._SUPPORTED_FEATURES is not None
                and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
            ):
                raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This is to allow a case where a proxy may be set for a protocol
            # for one handler in which such protocol (and proxy) is not supported by another handler.
            if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
                continue

            if self._SUPPORTED_PROXY_SCHEMES is None:
                # Skip proxy scheme checks
                continue

            # Scheme-less proxies are not supported
            if urllib.request._parse_proxy(proxy_url)[0] is None:
                raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')

            scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
            if scheme not in self._SUPPORTED_PROXY_SCHEMES:
                raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"')

    def _check_cookiejar_extension(self, extensions):
        """Reject a `cookiejar` extension that is set but is not a CookieJar.

        @raises UnsupportedRequest: if the extension is neither None/absent nor a CookieJar.
        """
        cookiejar = extensions.get('cookiejar')
        # Test for None rather than truthiness, matching the timeout check:
        # falsy values of the wrong type (e.g. '' or 0) must still be rejected.
        if cookiejar is None:
            return
        if not isinstance(cookiejar, CookieJar):
            raise UnsupportedRequest('cookiejar is not a CookieJar')

    def _check_timeout_extension(self, extensions):
        """Reject a `timeout` extension that is set but is not a float or int.

        @raises UnsupportedRequest: if the extension is neither None/absent nor a float/int.
        """
        if extensions.get('timeout') is None:
            return
        if not isinstance(extensions['timeout'], (float, int)):
            raise UnsupportedRequest('timeout is not a float or int')

    def _check_extensions(self, extensions):
        """Run the checks for the extensions defined by RequestHandler (cookiejar, timeout)."""
        self._check_cookiejar_extension(extensions)
        self._check_timeout_extension(extensions)

    def _validate(self, request):
        """Run the url scheme, proxy and extension checks on the request.

        The request's own proxies are checked when non-empty, otherwise the handler's.
        """
        self._check_url_scheme(request)
        self._check_proxies(request.proxies or self.proxies)
        self._check_extensions(request.extensions)

    # NOTE(review): wrap_request_errors is defined in ._helper; presumably it
    # normalises exceptions raised by the wrapped method - confirm there.
    @wrap_request_errors
    def validate(self, request: Request):
        """Check whether this handler supports the request.

        @raises TypeError: if `request` is not a Request (before any wrapping by the decorator).
        """
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        self._validate(request)

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send the request through _send() and return its result.

        @raises TypeError: if `request` is not a Request (before any wrapping by the decorator).
        """
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        return self._send(request)

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""

    def close(self):
        """Release resources held by the handler. The base implementation does nothing."""
        pass

    @classproperty
    def RH_NAME(cls):
        """Display name: the class name without its last two characters."""
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        """Registry key: the class name without its "RH" suffix."""
        assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return cls.__name__[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Leaving the `with` block closes the handler; exceptions are not suppressed.
        self.close()
319
320
class Request:
    """
    Represents a request to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
            self,
            url: str,
            data: RequestData = None,
            headers: typing.Mapping = None,
            proxies: dict = None,
            query: dict = None,
            method: str = None,
            extensions: dict = None
    ):

        # Backing fields must exist before the property setters below run.
        self._headers = HTTPHeaderDict()
        self._data = None

        if query:
            url = update_url_query(url, query)

        self.url = url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        """The request url, as stored by the setter."""
        return self._url

    @url.setter
    def url(self, url):
        """Store the url after passing it through escape_url().

        A url starting with "//" is given the "http:" scheme first.

        @raises TypeError: if `url` is not a str.
        """
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        elif url.startswith('//'):
            url = 'http:' + url
        self._url = escape_url(url)

    @property
    def method(self):
        """The explicit method if one was set, else POST when data is present, else GET."""
        return self._method or ('POST' if self.data is not None else 'GET')

    @method.setter
    def method(self, method):
        """Store the method upper-cased, or None to use the data-dependent default.

        @raises TypeError: if `method` is neither None nor a str.
        """
        if method is None:
            self._method = None
        elif isinstance(method, str):
            self._method = method.upper()
        else:
            raise TypeError('method must be a string')

    @property
    def data(self):
        """The request payload, or None."""
        return self._data

    @data.setter
    def data(self, data: RequestData):
        """Set the payload and keep the Content-Length / Content-Type headers consistent with it.

        @raises TypeError: if `data` is a str, a Mapping, or not bytes/iterable/file-like.
        """
        # Try catch some common mistakes
        if data is not None and (
            not isinstance(data, (bytes, io.IOBase, Iterable)) or isinstance(data, (str, Mapping))
        ):
            raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        # Re-assigning None over None still drops any Content-Length header.
        if data == self._data and self._data is None:
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                # The previous payload's length no longer applies.
                self.headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            self.headers.pop('Content-Type', None)

        # Give a payload without an explicit Content-Type the form-encoded default.
        if 'Content-Type' not in self.headers and self._data is not None:
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        """The request headers."""
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not an HTTPHeaderDict, it will be converted to one.

        An HTTPHeaderDict is stored as-is (not copied).

        @raises TypeError: if `new_headers` is not a mapping.
        """
        if isinstance(new_headers, HTTPHeaderDict):
            self._headers = new_headers
        elif isinstance(new_headers, Mapping):
            self._headers = HTTPHeaderDict(new_headers)
        else:
            raise TypeError('headers must be a mapping')

    def update(self, url=None, data=None, headers=None, query=None):
        """Update the request in place.

        @param url: replacement url; the current url is kept when falsy.
        @param data: replacement payload; the current payload is kept only when this is None.
        @param headers: headers to merge into the existing headers.
        @param query: query parameters to apply to the (possibly replaced) url.
        """
        # Test against None, not truthiness, so an empty payload such as b''
        # replaces the current data instead of being silently ignored.
        self.data = data if data is not None else self.data
        self.headers.update(headers or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return a new request of the same class.

        Headers and proxies are deep-copied and extensions are shallow-copied;
        the payload object itself is shared with the original.
        """
        return self.__class__(
            url=self.url,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            data=self._data,
            extensions=copy.copy(self.extensions),
            method=self._method,
        )
440
441
# Convenience constructors: Request with the `method` keyword pre-bound via functools.partial.
HEADRequest = functools.partial(Request, method='HEAD')
PUTRequest = functools.partial(Request, method='PUT')
444
445
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    Out of the box this is a thin wrapper around a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    """

    def __init__(
            self,
            fp: typing.IO,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str = None):

        self.fp = fp

        # Copy the headers into an email Message, one add_header() call per item.
        self.headers = Message()
        for header_name, header_value in headers.items():
            self.headers.add_header(header_name, header_value)

        self.status = status
        self.url = url

        if reason:
            self.reason = reason
        else:
            # Fall back to the standard phrase; statuses unknown to HTTPStatus have none.
            try:
                self.reason = HTTPStatus(status).phrase
            except ValueError:
                self.reason = None

    def readable(self):
        """Report whether the wrapped file object is readable."""
        return self.fp.readable()

    def read(self, amt: int = None) -> bytes:
        """Read from the wrapped file object, re-raising any failure as TransportError."""
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            chunk = self.fp.read(amt)
        except Exception as err:
            raise TransportError(cause=err) from err
        return chunk

    def close(self):
        """Close the wrapped file object, then this object."""
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Return the value of header `name`, or `default` when it is absent.

        Multiple values for the same header are joined with ", ", except for
        Set-Cookie, where only the first value is returned."""
        values = self.headers.get_all(name)
        if not values:
            return default
        # Special case for Set-Cookie, only get the first one
        # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
        return values[0] if name.title() == 'Set-Cookie' else ', '.join(values)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        return self.status

    def getcode(self):
        return self.status

    def geturl(self):
        return self.url

    def info(self):
        return self.headers

    def getheader(self, name, default=None):
        return self.get_header(name, default)