]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/common.py
584c7bb4db4b6792f484c6b95376402cd548c21f
[yt-dlp.git] / yt_dlp / networking / common.py
1 from __future__ import annotations
2
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
15
16 from ._helper import make_ssl_context, wrap_request_errors
17 from .exceptions import (
18 NoSupportingHandlers,
19 RequestError,
20 TransportError,
21 UnsupportedRequest,
22 )
23 from ..compat.types import NoneType
24 from ..cookies import YoutubeDLCookieJar
25 from ..utils import (
26 bug_reports_message,
27 classproperty,
28 deprecation_warning,
29 error_to_str,
30 update_url_query,
31 )
32 from ..utils.networking import HTTPHeaderDict, normalize_url
33
34
def register_preference(*handlers: type[RequestHandler]):
    """Build a decorator that registers a preference function.

    The decorated function is wrapped so that it is only consulted for
    handlers that are instances of one of the given `handlers` classes;
    for any other handler the wrapper returns 0. When no classes are
    given, the preference applies to every handler.

    The wrapper (not the original function) is added to `_RH_PREFERENCES`
    and is what the decorator returns.
    """
    assert all(issubclass(handler, RequestHandler) for handler in handlers)

    def decorator(preference: Preference):
        @functools.wraps(preference)
        def scoped_preference(handler, *args, **kwargs):
            # Handler is outside the requested classes: no opinion
            if handlers and not isinstance(handler, handlers):
                return 0
            return preference(handler, *args, **kwargs)

        _RH_PREFERENCES.add(scoped_preference)
        return scoped_preference

    return decorator
47
48
class RequestDirector:
    """Dispatches a request to a RequestHandler that supports it.

    Handlers are tried in order of preference. Preference functions of the
    form func(handler, request) -> int can be added to the `preferences` set;
    the results of all of them are summed per handler and the handlers are
    tried from the highest total to the lowest.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        self.handlers: dict[str, RequestHandler] = {}
        self.preferences: set[Preference] = set()
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose

    def close(self):
        """Close every registered handler."""
        for rh in self.handlers.values():
            rh.close()

    def add_handler(self, handler: RequestHandler):
        """Register a handler under its RH_KEY, replacing any handler already stored under that key"""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _get_handlers(self, request: Request) -> list[RequestHandler]:
        """Return the registered handlers, most preferred first, for the given request"""
        scores = {}
        for rh in self.handlers.values():
            scores[rh] = sum(pref(rh, request) for pref in self.preferences)

        summary = ', '.join(f'{rh.RH_NAME}={score}' for rh, score in scores.items())
        self._print_verbose(f'Handler preferences for this request: {summary}')
        # The sort is stable, so handlers with equal scores keep their registration order
        return sorted(self.handlers.values(), key=scores.get, reverse=True)

    def _print_verbose(self, msg):
        """Write a debug message through the logger when verbose mode is on."""
        if not self.verbose:
            return
        self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Hand the request to the first suitable RequestHandler and return its Response.

        Handlers that reject the request during validation (UnsupportedRequest) are skipped.
        A RequestError raised while sending is propagated unchanged; any other exception
        is logged, recorded, and the next handler is tried.
        If no handler produces a response, NoSupportingHandlers is raised with the collected errors.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unsupported, unexpected = [], []
        for rh in self._get_handlers(request):
            self._print_verbose(f'Checking if "{rh.RH_NAME}" supports this request.')
            try:
                rh.validate(request)
            except UnsupportedRequest as err:
                self._print_verbose(
                    f'"{rh.RH_NAME}" cannot handle this request (reason: {error_to_str(err)})')
                unsupported.append(err)
                continue

            self._print_verbose(f'Sending request via "{rh.RH_NAME}"')
            try:
                response = rh.send(request)
            except RequestError:
                # Expected failure of the request itself; do not fall back to another handler
                raise
            except Exception as err:
                self.logger.error(
                    f'[{rh.RH_NAME}] Unexpected error: {error_to_str(err)}{bug_reports_message()}',
                    is_error=False)
                unexpected.append(err)
            else:
                assert isinstance(response, Response)
                return response

        raise NoSupportingHandlers(unsupported, unexpected)
128
129
# RequestHandler classes registered through register_rh(), keyed by their RH_KEY
_REQUEST_HANDLERS = {}


def register_rh(handler):
    """Add a RequestHandler class to `_REQUEST_HANDLERS` and return the class unchanged.

    The handler must be a RequestHandler subclass whose RH_KEY is not registered yet.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    key = handler.RH_KEY
    assert key not in _REQUEST_HANDLERS, f'RequestHandler {key} already registered'
    _REQUEST_HANDLERS[key] = handler
    return handler
139
140
@enum.unique
class Features(enum.Enum):
    """Optional proxy features a RequestHandler may list in `_SUPPORTED_FEATURES`."""

    # The handler accepts the "all" key in a proxies dict
    ALL_PROXY = enum.auto()
    # The handler accepts the "no" key in a proxies dict
    NO_PROXY = enum.auto()
144
145
class RequestHandler(abc.ABC):

    """Base class for request handlers

    A request handler is a class that, given a Request,
    processes it from start to finish and returns a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which performs the underlying request logic and returns a Response.

    The RH_NAME class variable may contain a display name for the RequestHandler.
    By default, it is generated from the class name.

    The class name of a concrete request handler MUST have "RH" as its suffix.

    All exceptions raised by a RequestHandler should be an instance of RequestError.
    Any other exception raised is treated as a handler issue.

    If a Request is not supported by the handler, an UnsupportedRequest
    should be raised with a reason.

    By default, _validate() checks the request against the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with a url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with a url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in the Features enum.

    Any of the above may be set to None to disable the corresponding check.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may be available for individual Requests too. In this case,
    either the Request configuration option takes precedence or they are merged.

    Requests may have additional optional parameters defined as extensions.
    RequestHandler subclasses may choose to support custom extensions.

    If an extension is supported, subclasses should extend _check_extensions(extensions)
    to pop and validate the extension.
    - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request.
    - `timeout`: socket timeout to use for this request.
    To enable these, add extensions.pop('<extension>', None) to _check_extensions

    Apart from the url protocol, the proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: YoutubeDLCookieJar = None,
        timeout: float | int | None = None,
        proxies: dict = None,
        source_address: str = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):

        self._logger = logger
        self.headers = headers or {}
        # Only a missing cookiejar gets replaced; an empty one passed in is kept
        self.cookiejar = YoutubeDLCookieJar() if cookiejar is None else cookiejar
        # Falsy timeouts (None, 0) fall back to 20 seconds
        self.timeout = float(timeout or 20)
        self.proxies = proxies or {}
        self.source_address = source_address
        self.verbose = verbose
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        super().__init__()

    def _make_sslcontext(self):
        """Create an SSL context from this handler's certificate and verification settings."""
        return make_ssl_context(
            verify=self.verify,
            legacy_support=self.legacy_ssl_support,
            use_certifi=not self.prefer_system_certs,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Combine the handler headers with the given request headers into a new HTTPHeaderDict."""
        return HTTPHeaderDict(self.headers, request_headers)

    def _check_url_scheme(self, request: Request):
        """Return the lowercased url scheme of the request, raising UnsupportedRequest if it is not supported."""
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        supported = self._SUPPORTED_URL_SCHEMES
        if supported is None or scheme in supported:
            return scheme  # for further processing
        raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')

    def _check_proxies(self, proxies):
        """Raise UnsupportedRequest if any relevant entry of the proxies dict cannot be used by this handler."""
        features = self._SUPPORTED_FEATURES
        url_schemes = self._SUPPORTED_URL_SCHEMES
        proxy_schemes = self._SUPPORTED_PROXY_SCHEMES

        def lacks(feature):
            # A feature list of None disables feature checks altogether
            return features is not None and feature not in features

        for key, proxy_url in proxies.items():
            if proxy_url is None:
                continue

            if key == 'no':
                if lacks(Features.NO_PROXY):
                    raise UnsupportedRequest('"no" proxy is not supported')
                continue

            if key == 'all' and lacks(Features.ALL_PROXY):
                raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This allows a proxy to be set for a protocol for one handler
            # when that protocol (and proxy) is not supported by another handler.
            if url_schemes is not None and key not in (*url_schemes, 'all'):
                continue

            if proxy_schemes is None:
                # Skip proxy scheme checks
                continue

            try:
                if urllib.request._parse_proxy(proxy_url)[0] is None:
                    # Scheme-less proxies are not supported
                    raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')
            except ValueError as e:
                # parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
                raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}')

            proxy_scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
            if proxy_scheme not in proxy_schemes:
                raise UnsupportedRequest(f'Unsupported proxy type: "{proxy_scheme}"')

    def _check_extensions(self, extensions):
        """Check extensions for unsupported extensions. Subclasses should extend this."""
        assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType))
        assert isinstance(extensions.get('timeout'), (float, int, NoneType))

    def _validate(self, request):
        """Run the url scheme, proxy and extension checks for the request."""
        self._check_url_scheme(request)
        # Proxies set on the request replace the handler-level proxies
        self._check_proxies(request.proxies or self.proxies)
        # Work on a copy so popping supported extensions does not modify the request
        remaining = request.extensions.copy()
        self._check_extensions(remaining)
        if remaining:
            # TODO: add support for optional extensions
            raise UnsupportedRequest(f'Unsupported extensions: {", ".join(remaining.keys())}')

    @wrap_request_errors
    def validate(self, request: Request):
        """Check that this handler supports the request; see _validate()."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        self._validate(request)

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send the request through _send() and return its result."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        return self._send(request)

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""
        pass

    def close(self):
        """Does nothing by default; subclasses may override it."""
        pass

    @classproperty
    def RH_NAME(cls):
        # Class name with its last two characters (the "RH" suffix) removed
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return cls.__name__[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Leaving the context closes the handler
        self.close()
349
350
class Request:
    """
    A request to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
            self,
            url: str,
            data: RequestData = None,
            headers: typing.Mapping = None,
            proxies: dict = None,
            query: dict = None,
            method: str = None,
            extensions: dict = None
    ):

        # Backing fields must exist before the property setters below are used
        self._headers = HTTPHeaderDict()
        self._data = None

        self.url = update_url_query(url, query) if query else url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        if url.startswith('//'):
            # Protocol-relative url: use http
            url = f'http:{url}'
        self._url = normalize_url(url)

    @property
    def method(self):
        # Without an explicit method, the presence of a payload decides
        if self._method:
            return self._method
        return 'GET' if self.data is None else 'POST'

    @method.setter
    def method(self, method):
        if method is not None and not isinstance(method, str):
            raise TypeError('method must be a string')
        self._method = None if method is None else method.upper()

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        # Try to catch some common mistakes
        if data is not None:
            acceptable = (
                isinstance(data, (bytes, io.IOBase, Iterable))
                and not isinstance(data, (str, Mapping)))
            if not acceptable:
                raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        if data == self._data and self._data is None:
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                self.headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            self.headers.pop('Content-Type', None)
        elif 'Content-Type' not in self.headers:
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not an HTTPHeaderDict, it will be converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            self._headers = new_headers
            return
        if not isinstance(new_headers, Mapping):
            raise TypeError('headers must be a mapping')
        self._headers = HTTPHeaderDict(new_headers)

    def update(self, url=None, data=None, headers=None, query=None):
        """Update the request in place; arguments left as None keep the current value."""
        if data is None:
            data = self.data
        # Assigned even when unchanged so the data setter's header handling runs
        self.data = data
        self.headers.update(headers or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return a new request of the same class.

        Headers and proxies are deep-copied, extensions are shallow-copied,
        and the payload object itself is shared with the original.
        """
        headers = copy.deepcopy(self.headers)
        proxies = copy.deepcopy(self.proxies)
        extensions = copy.copy(self.extensions)
        return self.__class__(
            url=self.url,
            headers=headers,
            proxies=proxies,
            data=self._data,
            extensions=extensions,
            method=self._method,
        )


# Request constructors with the HTTP method pre-set
HEADRequest = functools.partial(Request, method='HEAD')
PUTRequest = functools.partial(Request, method='PUT')
474
475
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    By default, it is a thin wrapper around a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    """

    def __init__(
            self,
            fp: typing.IO,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str = None):

        self.fp = fp

        message = Message()
        for key, value in headers.items():
            message.add_header(key, value)
        self.headers = message

        self.status = status
        self.url = url

        if not reason:
            try:
                reason = HTTPStatus(status).phrase
            except ValueError:
                # Status code not known to HTTPStatus
                reason = None
        self.reason = reason

    def readable(self):
        return self.fp.readable()

    def read(self, amt: int = None) -> bytes:
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            return self.fp.read(amt)
        except Exception as err:
            raise TransportError(cause=err) from err

    def close(self):
        # Close the wrapped object first, then mark this wrapper as closed
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        values = self.headers.get_all(name)
        if not values:
            return default
        # Special case for Set-Cookie: only get the first one
        # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
        return values[0] if name.title() == 'Set-Cookie' else ', '.join(values)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)
558
559
if typing.TYPE_CHECKING:
    # Aliases that are only used in annotations; not defined at runtime
    RequestData = typing.Union[bytes, Iterable[bytes], typing.IO, None]
    Preference = typing.Callable[[RequestHandler, Request], int]

# Preference functions added through register_preference()
_RH_PREFERENCES: set[Preference] = set()