]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/common.py
4c66ba66aaf3ee5e7b884be41c81ada4d1d27d52
[yt-dlp.git] / yt_dlp / networking / common.py
1 from __future__ import annotations
2
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
15
16 from ._helper import make_ssl_context, wrap_request_errors
17 from .exceptions import (
18 NoSupportingHandlers,
19 RequestError,
20 TransportError,
21 UnsupportedRequest,
22 )
23 from ..compat.types import NoneType
24 from ..cookies import YoutubeDLCookieJar
25 from ..utils import (
26 bug_reports_message,
27 classproperty,
28 deprecation_warning,
29 error_to_str,
30 update_url_query,
31 )
32 from ..utils.networking import HTTPHeaderDict, normalize_url
33
34
def register_preference(*handlers: type[RequestHandler]):
    """Decorator factory that registers a handler preference function.

    The decorated function is wrapped and the wrapper is added to the
    module-level `_RH_PREFERENCES` set. The wrapper is what the decorator
    returns.

    @param handlers: RequestHandler classes the preference applies to.
                     When none are given, the preference applies to every handler.
                     For a handler that is not an instance of any of these classes,
                     the wrapper returns 0 without calling the preference.
    """
    for handler_cls in handlers:
        assert issubclass(handler_cls, RequestHandler)

    def decorator(preference: Preference):
        @functools.wraps(preference)
        def scoped_preference(handler, *args, **kwargs):
            # Restricted to specific handler classes and this is not one of them
            if handlers and not isinstance(handler, handlers):
                return 0
            return preference(handler, *args, **kwargs)

        _RH_PREFERENCES.add(scoped_preference)
        return scoped_preference

    return decorator
47
48
class RequestDirector:
    """Dispatches a request to a registered RequestHandler that supports it.

    Handlers are tried in order of preference. Preference functions in the form of
    func(handler, request) -> int can be added to the `preferences` set; their
    results are summed per handler and handlers with a higher total are tried first.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose
        self.handlers: dict[str, RequestHandler] = {}
        self.preferences: set[Preference] = set()

    def close(self):
        """Close every registered handler, then forget all of them."""
        for rh in self.handlers.values():
            rh.close()
        self.handlers.clear()

    def add_handler(self, handler: RequestHandler):
        """Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _get_handlers(self, request: Request) -> list[RequestHandler]:
        """Return the registered handlers, highest summed preference first."""
        scores = {}
        for rh in self.handlers.values():
            scores[rh] = sum(pref(rh, request) for pref in self.preferences)

        summary = ', '.join(f'{rh.RH_NAME}={score}' for rh, score in scores.items())
        self._print_verbose(f'Handler preferences for this request: {summary}')
        return sorted(self.handlers.values(), key=scores.get, reverse=True)

    def _print_verbose(self, msg):
        """Write a director debug message to the logger's stdout when verbose is set."""
        if not self.verbose:
            return
        self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Passes a request onto a suitable RequestHandler

        Each handler is asked to validate the request first. A handler that raises
        UnsupportedRequest is skipped. A RequestError raised while sending is
        propagated as-is; any other exception is logged and the next handler is tried.

        Raises RequestError if no handlers are configured, and NoSupportingHandlers
        if no handler returned a response.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unsupported_errors = []
        unexpected_errors = []
        for handler in self._get_handlers(request):
            self._print_verbose(f'Checking if "{handler.RH_NAME}" supports this request.')
            try:
                handler.validate(request)
            except UnsupportedRequest as err:
                self._print_verbose(
                    f'"{handler.RH_NAME}" cannot handle this request (reason: {error_to_str(err)})')
                unsupported_errors.append(err)
                continue

            self._print_verbose(f'Sending request via "{handler.RH_NAME}"')
            try:
                response = handler.send(request)
            except RequestError:
                # Expected failure type: let the caller deal with it
                raise
            except Exception as err:
                # Anything else is treated as a handler issue; report it and fall through
                self.logger.error(
                    f'[{handler.RH_NAME}] Unexpected error: {error_to_str(err)}{bug_reports_message()}',
                    is_error=False)
                unexpected_errors.append(err)
            else:
                assert isinstance(response, Response)
                return response

        raise NoSupportingHandlers(unsupported_errors, unexpected_errors)
129
130
# Registry of RequestHandler classes, keyed by their RH_KEY. Populated by register_rh().
_REQUEST_HANDLERS = {}
132
133
def register_rh(handler):
    """Register a RequestHandler class

    Stores the class in `_REQUEST_HANDLERS` under its RH_KEY and returns the class
    unchanged, so this can be used as a class decorator.
    The class must be a RequestHandler subclass whose RH_KEY is not registered yet.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    key = handler.RH_KEY
    assert key not in _REQUEST_HANDLERS, f'RequestHandler {key} already registered'
    _REQUEST_HANDLERS[key] = handler
    return handler
140
141
class Features(enum.Enum):
    """Optional capabilities a RequestHandler can declare in `_SUPPORTED_FEATURES`."""

    # Handler accepts the `all` key in the proxies dict
    ALL_PROXY = enum.auto()
    # Handler accepts the `no` key in the proxies dict
    NO_PROXY = enum.auto()
145
146
class RequestHandler(abc.ABC):

    """Request Handler class

    Request handlers are classes that, given a Request,
    process the request from start to finish and return a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which handles the underlying request logic and returns a Response.

    RH_NAME class variable may contain a display name for the RequestHandler.
    By default, this is generated from the class name.

    The concrete request handler MUST have "RH" as the suffix in the class name.

    All exceptions raised by a RequestHandler should be an instance of RequestError.
    Any other exception raised will be treated as a handler issue.

    If a Request is not supported by the handler, an UnsupportedRequest
    should be raised with a reason.

    By default, some checks are done on the request in _validate() based on the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.

    The above may be set to None to disable the checks.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may be available for individual Requests too. In this case,
    either the Request configuration option takes precedence or they are merged.

    Requests may have additional optional parameters defined as extensions.
    RequestHandler subclasses may choose to support custom extensions.

    If an extension is supported, subclasses should extend _check_extensions(extensions)
    to pop and validate the extension.
    - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request.
    - `timeout`: socket timeout to use for this request.
    To enable these, add extensions.pop('<extension>', None) to _check_extensions

    Apart from the url protocol, proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: YoutubeDLCookieJar = None,
        timeout: float | int | None = None,
        proxies: dict = None,
        source_address: str = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):
        """Store the handler-wide configuration. Unknown keyword arguments are ignored."""

        self._logger = logger
        self.headers = headers or {}
        # Compare against None: an empty cookiejar passed by the caller must be kept
        self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar()
        # A missing (or zero) timeout falls back to 20
        self.timeout = float(timeout or 20)
        self.proxies = proxies or {}
        self.source_address = source_address
        self.verbose = verbose
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        super().__init__()

    def _make_sslcontext(self):
        """Build an SSL context from this handler's verify/legacy/certificate settings."""
        return make_ssl_context(
            verify=self.verify,
            legacy_support=self.legacy_ssl_support,
            use_certifi=not self.prefer_system_certs,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Return a new HTTPHeaderDict built from the handler headers and the request headers."""
        return HTTPHeaderDict(self.headers, request_headers)

    def _calculate_timeout(self, request):
        """Return the request's `timeout` extension as a float, or the handler timeout if unset."""
        return float(request.extensions.get('timeout') or self.timeout)

    def _get_cookiejar(self, request):
        """Return the request's `cookiejar` extension, or the handler cookiejar if none was given."""
        cookiejar = request.extensions.get('cookiejar')
        # Do not use `or` here: a cookiejar without cookies has a length of 0 and is
        # therefore falsy, so an explicitly passed empty jar would be silently replaced
        # by the handler-wide one.
        return self.cookiejar if cookiejar is None else cookiejar

    def _get_proxies(self, request):
        """Return a copy of the request proxies, or of the handler proxies if the request has none."""
        return (request.proxies or self.proxies).copy()

    def _check_url_scheme(self, request: Request):
        """Raise UnsupportedRequest if the request url scheme is not supported; return the scheme."""
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
            raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')
        return scheme  # for further processing

    def _check_proxies(self, proxies):
        """Raise UnsupportedRequest if any proxy in the given proxies dict cannot be used by this handler."""
        for proxy_key, proxy_url in proxies.items():
            if proxy_url is None:
                continue
            if proxy_key == 'no':
                if self._SUPPORTED_FEATURES is not None and Features.NO_PROXY not in self._SUPPORTED_FEATURES:
                    raise UnsupportedRequest('"no" proxy is not supported')
                continue
            if (
                proxy_key == 'all'
                and self._SUPPORTED_FEATURES is not None
                and Features.ALL_PROXY not in self._SUPPORTED_FEATURES
            ):
                raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This is to allow a case where a proxy may be set for a protocol
            # for one handler in which such protocol (and proxy) is not supported by another handler.
            if self._SUPPORTED_URL_SCHEMES is not None and proxy_key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
                continue

            if self._SUPPORTED_PROXY_SCHEMES is None:
                # Skip proxy scheme checks
                continue

            try:
                if urllib.request._parse_proxy(proxy_url)[0] is None:
                    # Scheme-less proxies are not supported
                    raise UnsupportedRequest(f'Proxy "{proxy_url}" missing scheme')
            except ValueError as e:
                # parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
                raise UnsupportedRequest(f'Invalid proxy url "{proxy_url}": {e}')

            scheme = urllib.parse.urlparse(proxy_url).scheme.lower()
            if scheme not in self._SUPPORTED_PROXY_SCHEMES:
                raise UnsupportedRequest(f'Unsupported proxy type: "{scheme}"')

    def _check_extensions(self, extensions):
        """Check extensions for unsupported extensions. Subclasses should extend this."""
        assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType))
        assert isinstance(extensions.get('timeout'), (float, int, NoneType))

    def _validate(self, request):
        """Run the url scheme, proxy and extension checks; raise UnsupportedRequest on failure."""
        self._check_url_scheme(request)
        self._check_proxies(request.proxies or self.proxies)
        # Work on a copy: _check_extensions is expected to pop the extensions it supports
        extensions = request.extensions.copy()
        self._check_extensions(extensions)
        if extensions:
            # TODO: add support for optional extensions
            raise UnsupportedRequest(f'Unsupported extensions: {", ".join(extensions.keys())}')

    @wrap_request_errors
    def validate(self, request: Request):
        """Check whether this handler supports the request. Raises TypeError for a non-Request."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        self._validate(request)

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send the request via _send() and return its result. Raises TypeError for a non-Request."""
        if not isinstance(request, Request):
            raise TypeError('Expected an instance of Request')
        return self._send(request)

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""
        pass

    def close(self):
        """Release resources held by the handler. Does nothing by default."""
        pass

    @classproperty
    def RH_NAME(cls):
        """Display name: the class name without its last two characters (the "RH" suffix)."""
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        """Registry key: the class name without the "RH" suffix, which is asserted to be present."""
        assert cls.__name__.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return cls.__name__[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Leaving the `with` block closes the handler
        self.close()
359
360
class Request:
    """
    A request to be made by a RequestHandler.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
        self,
        url: str,
        data: RequestData = None,
        headers: typing.Mapping = None,
        proxies: dict = None,
        query: dict = None,
        method: str = None,
        extensions: dict = None
    ):
        # Backing fields must exist before any of the property setters below run
        self._data = None
        self._headers = HTTPHeaderDict()

        self.url = update_url_query(url, query) if query else url
        self.method = method
        if headers:
            self.headers = headers
        # note: data must be assigned after the headers, as its setter adjusts them
        self.data = data
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        if url.startswith('//'):
            # Scheme-relative url: prefix it with http
            url = 'http:' + url
        self._url = normalize_url(url)

    @property
    def method(self):
        """The explicitly set method, otherwise POST when there is payload data and GET when not."""
        if self._method:
            return self._method
        return 'GET' if self.data is None else 'POST'

    @method.setter
    def method(self, method):
        if method is not None and not isinstance(method, str):
            raise TypeError('method must be a string')
        self._method = None if method is None else method.upper()

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        """Set the payload and keep the Content-Length / Content-Type headers in line with it."""
        # Try catch some common mistakes
        if data is not None:
            is_payload_type = isinstance(data, (bytes, io.IOBase, Iterable))
            if not is_payload_type or isinstance(data, (str, Mapping)):
                raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        request_headers = self.headers

        if data == self._data and self._data is None:
            request_headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                request_headers.pop('Content-Length', None)
            self._data = data

        if self._data is None:
            request_headers.pop('Content-Type', None)
        elif 'Content-Type' not in request_headers:
            request_headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not a HTTPHeaderDict, it will be converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            replacement = new_headers
        elif isinstance(new_headers, Mapping):
            replacement = HTTPHeaderDict(new_headers)
        else:
            raise TypeError('headers must be a mapping')
        self._headers = replacement

    def update(self, url=None, data=None, headers=None, query=None, extensions=None):
        """Update the request in place; arguments left as None keep the current value."""
        if data is None:
            data = self.data
        # The data setter is always invoked, even when the payload is unchanged
        self.data = data
        self.headers.update(headers or {})
        self.extensions.update(extensions or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return a new request with deep-copied headers and proxies and shallow-copied extensions."""
        return self.__class__(
            url=self.url,
            data=self._data,
            method=self._method,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            extensions=copy.copy(self.extensions),
        )
481
482
# Convenience constructors: Request with the HTTP method preset to HEAD / PUT.
HEADRequest = functools.partial(Request, method='HEAD')
PUTRequest = functools.partial(Request, method='PUT')
485
486
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    By default, it provides a basic wrapper for a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    """

    def __init__(
            self,
            fp: io.IOBase,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str = None):

        self.fp = fp
        self.headers = Message()
        for header_name, header_value in headers.items():
            self.headers.add_header(header_name, header_value)
        self.status = status
        self.url = url

        if reason:
            self.reason = reason
        else:
            # Fall back to the standard phrase; unknown status codes have no reason
            try:
                self.reason = HTTPStatus(status).phrase
            except ValueError:
                self.reason = None

    def readable(self):
        return self.fp.readable()

    def read(self, amt: int = None) -> bytes:
        """Read from the wrapped response; any error is re-raised as TransportError."""
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            chunk = self.fp.read(amt)
        except Exception as e:
            raise TransportError(cause=e) from e
        else:
            return chunk

    def close(self):
        """Close the wrapped response as well as this object."""
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        values = self.headers.get_all(name)
        if values:
            # Special case, only get the first one
            # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
            return values[0] if name.title() == 'Set-Cookie' else ', '.join(values)
        return default

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)
569
570
# Aliases that are only needed by annotations, so they are defined for type checkers only.
if typing.TYPE_CHECKING:
    RequestData = bytes | Iterable[bytes] | typing.IO | None
    Preference = typing.Callable[[RequestHandler, Request], int]

# Preference functions registered through register_preference().
_RH_PREFERENCES: set[Preference] = set()