]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/common.py
[networking] Remove dot segments during URL normalization (#7662)
[yt-dlp.git] / yt_dlp / networking / common.py
1 from __future__ import annotations
2
3 import abc
4 import copy
5 import enum
6 import functools
7 import io
8 import typing
9 import urllib.parse
10 import urllib.request
11 import urllib.response
12 from collections.abc import Iterable, Mapping
13 from email.message import Message
14 from http import HTTPStatus
15 from http.cookiejar import CookieJar
16
17 from ._helper import make_ssl_context, wrap_request_errors
18 from .exceptions import (
19 NoSupportingHandlers,
20 RequestError,
21 TransportError,
22 UnsupportedRequest,
23 )
24 from ..compat.types import NoneType
25 from ..utils import (
26 bug_reports_message,
27 classproperty,
28 deprecation_warning,
29 error_to_str,
30 update_url_query,
31 )
32 from ..utils.networking import HTTPHeaderDict, normalize_url
33
if typing.TYPE_CHECKING:
    # Alias describing the payload types a Request accepts as `data`.
    # It is only defined while type checking, so it must only be used in annotations.
    RequestData = bytes | Iterable[bytes] | typing.IO | None
36
37
class RequestDirector:
    """RequestDirector class

    Keeps a set of RequestHandlers and, when given a request,
    forwards it to one of the handlers that supports it.

    @param logger: Logger instance.
    @param verbose: Print debug request information to stdout.
    """

    def __init__(self, logger, verbose=False):
        # Registered handlers, keyed by their RH_KEY
        self.handlers: dict[str, RequestHandler] = {}
        self.logger = logger  # TODO(Grub4k): default logger
        self.verbose = verbose

    def close(self):
        """Close every registered handler."""
        for rh in self.handlers.values():
            rh.close()

    def add_handler(self, handler: RequestHandler):
        """Add a handler. If a handler of the same RH_KEY exists, it will overwrite it"""
        assert isinstance(handler, RequestHandler), 'handler must be a RequestHandler'
        self.handlers[handler.RH_KEY] = handler

    def _print_verbose(self, msg):
        """Write a director debug message to the logger's stdout, but only in verbose mode."""
        if not self.verbose:
            return
        self.logger.stdout(f'director: {msg}')

    def send(self, request: Request) -> Response:
        """
        Passes a request onto a suitable RequestHandler

        Handlers are tried in reverse order of the handlers dict. The first handler
        that validates the request and sends it without an unexpected error wins.

        @param request: the Request to send.
        @returns: the Response produced by the handler.
        @raises RequestError: if no handlers are configured, or as raised by a handler while sending.
        @raises NoSupportingHandlers: if no handler could deal with the request.
        """
        if not self.handlers:
            raise RequestError('No request handlers configured')

        assert isinstance(request, Request)

        unsupported, unexpected = [], []
        # TODO (future): add a per-request preference system
        # Work on a reversed snapshot of the registered handlers
        candidates = list(self.handlers.values())
        candidates.reverse()
        for rh in candidates:
            self._print_verbose(f'Checking if "{rh.RH_NAME}" supports this request.')
            try:
                rh.validate(request)
            except UnsupportedRequest as err:
                self._print_verbose(
                    f'"{rh.RH_NAME}" cannot handle this request (reason: {error_to_str(err)})')
                unsupported.append(err)
                continue

            self._print_verbose(f'Sending request via "{rh.RH_NAME}"')
            try:
                response = rh.send(request)
            except RequestError:
                # Expected failure mode of a handler: let the caller deal with it
                raise
            except Exception as err:
                # Anything else is a handler issue: report it and fall through to the next handler
                self.logger.error(
                    f'[{rh.RH_NAME}] Unexpected error: {error_to_str(err)}{bug_reports_message()}',
                    is_error=False)
                unexpected.append(err)
            else:
                assert isinstance(response, Response)
                return response

        raise NoSupportingHandlers(unsupported, unexpected)
103
104
# Registry of RequestHandler classes, keyed by their RH_KEY
_REQUEST_HANDLERS = {}


def register_rh(handler):
    """Register a RequestHandler class

    The class is stored in _REQUEST_HANDLERS under its RH_KEY and returned unchanged.
    Registering a second class with the same RH_KEY is an error.
    """
    assert issubclass(handler, RequestHandler), f'{handler} must be a subclass of RequestHandler'
    rh_key = handler.RH_KEY
    assert rh_key not in _REQUEST_HANDLERS, f'RequestHandler {rh_key} already registered'
    _REQUEST_HANDLERS[rh_key] = handler
    return handler
114
115
class Features(enum.Enum):
    """Optional features that a RequestHandler can list in `_SUPPORTED_FEATURES`."""

    # The `all` key of the proxies dict
    ALL_PROXY = enum.auto()
    # The `no` key of the proxies dict
    NO_PROXY = enum.auto()
119
120
class RequestHandler(abc.ABC):

    """Request Handler class

    A request handler is a class that takes a Request,
    processes it from start to finish and returns a Response.

    Concrete subclasses need to redefine the _send(request) method,
    which contains the underlying request logic and returns a Response.

    Naming:
    - The concrete request handler MUST have "RH" as the suffix in the class name.
    - RH_NAME class variable may contain a display name for the RequestHandler.
      By default, this is generated from the class name.

    Errors:
    - All exceptions raised by a RequestHandler should be an instance of RequestError.
      Any other exception raised will be treated as a handler issue.
    - If a Request is not supported by the handler, an UnsupportedRequest
      should be raised with a reason.

    By default, _validate() checks the request against the following class variables:
    - `_SUPPORTED_URL_SCHEMES`: a tuple of supported url schemes.
        Any Request with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_PROXY_SCHEMES`: a tuple of supported proxy url schemes. Any Request that contains
        a proxy url with an url scheme not in this list will raise an UnsupportedRequest.

    - `_SUPPORTED_FEATURES`: a tuple of supported features, as defined in Features enum.

    Any of the above may be set to None to disable the corresponding checks.

    Parameters:
    @param logger: logger instance
    @param headers: HTTP Headers to include when sending requests.
    @param cookiejar: Cookiejar to use for requests.
    @param timeout: Socket timeout to use when sending requests.
    @param proxies: Proxies to use for sending requests.
    @param source_address: Client-side IP address to bind to for requests.
    @param verbose: Print debug request and traffic information to stdout.
    @param prefer_system_certs: Whether to prefer system certificates over other means (e.g. certifi).
    @param client_cert: SSL client certificate configuration.
            dict with {client_certificate, client_certificate_key, client_certificate_password}
    @param verify: Verify SSL certificates
    @param legacy_ssl_support: Enable legacy SSL options such as legacy server connect and older cipher support.

    Some configuration options may be available for individual Requests too. In this case,
    either the Request configuration option takes precedence or they are merged.

    Requests may have additional optional parameters defined as extensions.
    RequestHandler subclasses may choose to support custom extensions.

    If an extension is supported, subclasses should extend _check_extensions(extensions)
    to pop and validate the extension.
    - Extensions left in `extensions` are treated as unsupported and UnsupportedRequest will be raised.

    The following extensions are defined for RequestHandler:
    - `cookiejar`: Cookiejar to use for this request.
    - `timeout`: socket timeout to use for this request.
    To enable these, add extensions.pop('<extension>', None) to _check_extensions

    Apart from the url protocol, proxies dict may contain the following keys:
    - `all`: proxy to use for all protocols. Used as a fallback if no proxy is set for a specific protocol.
    - `no`: comma separated list of hostnames (optionally with port) to not use a proxy for.
    Note: a RequestHandler may not support these, as defined in `_SUPPORTED_FEATURES`.

    """

    _SUPPORTED_URL_SCHEMES = ()
    _SUPPORTED_PROXY_SCHEMES = ()
    _SUPPORTED_FEATURES = ()

    def __init__(
        self, *,
        logger,  # TODO(Grub4k): default logger
        headers: HTTPHeaderDict = None,
        cookiejar: CookieJar = None,
        timeout: float | int | None = None,
        proxies: dict = None,
        source_address: str = None,
        verbose: bool = False,
        prefer_system_certs: bool = False,
        client_cert: dict[str, str | None] = None,
        verify: bool = True,
        legacy_ssl_support: bool = False,
        **_,
    ):

        self._logger = logger
        self.verbose = verbose
        self.source_address = source_address

        # Falsy values fall back to a default
        self.headers = headers or {}
        self.proxies = proxies or {}
        self.timeout = float(timeout or 20)

        # Only a missing (None) cookiejar is replaced with a new one
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

        # SSL related configuration, consumed by _make_sslcontext()
        self.verify = verify
        self.legacy_ssl_support = legacy_ssl_support
        self.prefer_system_certs = prefer_system_certs
        self._client_cert = client_cert or {}
        super().__init__()

    def _make_sslcontext(self):
        """Create an SSL context from the SSL configuration of this handler."""
        return make_ssl_context(
            verify=self.verify,
            legacy_support=self.legacy_ssl_support,
            use_certifi=not self.prefer_system_certs,
            **self._client_cert,
        )

    def _merge_headers(self, request_headers):
        """Combine the handler headers and the given request headers into one HTTPHeaderDict."""
        return HTTPHeaderDict(self.headers, request_headers)

    def _check_url_scheme(self, request: Request):
        """Return the lowercased url scheme of the request; raise UnsupportedRequest if unsupported."""
        scheme = urllib.parse.urlparse(request.url).scheme.lower()
        supported_schemes = self._SUPPORTED_URL_SCHEMES
        if supported_schemes is None or scheme in supported_schemes:
            return scheme  # for further processing
        raise UnsupportedRequest(f'Unsupported url scheme: "{scheme}"')

    def _check_proxies(self, proxies):
        """Raise UnsupportedRequest if the proxies dict contains something this handler cannot use."""
        for key, proxy in proxies.items():
            if proxy is None:
                continue

            if key == 'no':
                no_proxy_unsupported = (
                    self._SUPPORTED_FEATURES is not None
                    and Features.NO_PROXY not in self._SUPPORTED_FEATURES)
                if no_proxy_unsupported:
                    raise UnsupportedRequest('"no" proxy is not supported')
                continue

            if key == 'all':
                if self._SUPPORTED_FEATURES is not None and Features.ALL_PROXY not in self._SUPPORTED_FEATURES:
                    raise UnsupportedRequest('"all" proxy is not supported')

            # Unlikely this handler will use this proxy, so ignore.
            # This is to allow a case where a proxy may be set for a protocol
            # for one handler in which such protocol (and proxy) is not supported by another handler.
            if self._SUPPORTED_URL_SCHEMES is not None and key not in (*self._SUPPORTED_URL_SCHEMES, 'all'):
                continue

            # Skip proxy scheme checks
            if self._SUPPORTED_PROXY_SCHEMES is None:
                continue

            try:
                proxy_type = urllib.request._parse_proxy(proxy)[0]
                if proxy_type is None:
                    # Scheme-less proxies are not supported
                    raise UnsupportedRequest(f'Proxy "{proxy}" missing scheme')
            except ValueError as err:
                # parse_proxy may raise on some invalid proxy urls such as "/a/b/c"
                raise UnsupportedRequest(f'Invalid proxy url "{proxy}": {err}')

            proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
            if proxy_scheme not in self._SUPPORTED_PROXY_SCHEMES:
                raise UnsupportedRequest(f'Unsupported proxy type: "{proxy_scheme}"')

    def _check_extensions(self, extensions):
        """Check extensions for unsupported extensions. Subclasses should extend this."""
        cookiejar = extensions.get('cookiejar')
        assert isinstance(cookiejar, (CookieJar, NoneType))
        timeout = extensions.get('timeout')
        assert isinstance(timeout, (float, int, NoneType))

    def _validate(self, request):
        """Run the url scheme, proxy and extension checks; raise UnsupportedRequest on failure."""
        self._check_url_scheme(request)
        # Proxies of the request take precedence over those of the handler
        self._check_proxies(request.proxies or self.proxies)

        # Work on a copy so that the extensions of the request are left untouched
        remaining = request.extensions.copy()
        self._check_extensions(remaining)
        if not remaining:
            return
        # TODO: add support for optional extensions
        names = ", ".join(remaining.keys())
        raise UnsupportedRequest(f'Unsupported extensions: {names}')

    @wrap_request_errors
    def validate(self, request: Request):
        """Check whether this handler supports the request. Raises TypeError if it is not a Request."""
        if isinstance(request, Request):
            self._validate(request)
            return
        raise TypeError('Expected an instance of Request')

    @wrap_request_errors
    def send(self, request: Request) -> Response:
        """Send the request via _send() and return its result. Raises TypeError if it is not a Request."""
        if isinstance(request, Request):
            return self._send(request)
        raise TypeError('Expected an instance of Request')

    @abc.abstractmethod
    def _send(self, request: Request):
        """Handle a request from start to finish. Redefine in subclasses."""

    def close(self):
        """Hook for releasing resources; does nothing by default."""

    @classproperty
    def RH_NAME(cls):
        """Display name: the class name without its last two characters."""
        return cls.__name__[:-2]

    @classproperty
    def RH_KEY(cls):
        """Handler key: the class name without its mandatory "RH" suffix."""
        class_name = cls.__name__
        assert class_name.endswith('RH'), 'RequestHandler class names must end with "RH"'
        return class_name[:-2]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Leaving the `with` block closes the handler
        self.close()
323
324
class Request:
    """
    A request to be made.
    Partially backwards-compatible with urllib.request.Request.

    @param url: url to send. Will be sanitized.
    @param data: payload data to send. Must be bytes, iterable of bytes, a file-like object or None
    @param headers: headers to send.
    @param proxies: proxy dict mapping of proto:proxy to use for the request and any redirects.
    @param query: URL query parameters to update the url with.
    @param method: HTTP method to use. If no method specified, will use POST if payload data is present else GET
    @param extensions: Dictionary of Request extensions to add, as supported by handlers.
    """

    def __init__(
        self,
        url: str,
        data: RequestData = None,
        headers: typing.Mapping = None,
        proxies: dict = None,
        query: dict = None,
        method: str = None,
        extensions: dict = None,
    ):

        # Backing fields must exist before any property setter below runs
        self._headers = HTTPHeaderDict()
        self._data = None

        self.url = update_url_query(url, query) if query else url
        self.method = method
        if headers:
            self.headers = headers
        self.data = data  # note: must be done after setting headers
        self.proxies = proxies or {}
        self.extensions = extensions or {}

    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, url):
        if not isinstance(url, str):
            raise TypeError('url must be a string')
        if url.startswith('//'):
            # Protocol-relative url: default to http
            url = 'http:' + url
        self._url = normalize_url(url)

    @property
    def method(self):
        # Without an explicit method, it depends on whether there is a payload
        if self._method:
            return self._method
        return 'GET' if self.data is None else 'POST'

    @method.setter
    def method(self, method):
        if method is not None and not isinstance(method, str):
            raise TypeError('method must be a string')
        self._method = None if method is None else method.upper()

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, data: RequestData):
        # Try catch some common mistakes
        if data is not None:
            acceptable_type = isinstance(data, (bytes, io.IOBase, Iterable))
            if not acceptable_type or isinstance(data, (str, Mapping)):
                raise TypeError('data must be bytes, iterable of bytes, or a file-like object')

        if data == self._data and self._data is None:
            self.headers.pop('Content-Length', None)

        # https://docs.python.org/3/library/urllib.request.html#urllib.request.Request.data
        if data != self._data:
            if self._data is not None:
                self.headers.pop('Content-Length', None)
            self._data = data

        # No payload means no Content-Type; a payload gets a default Content-Type if none is set
        if self._data is None:
            self.headers.pop('Content-Type', None)
        elif 'Content-Type' not in self.headers:
            self.headers['Content-Type'] = 'application/x-www-form-urlencoded'

    @property
    def headers(self) -> HTTPHeaderDict:
        return self._headers

    @headers.setter
    def headers(self, new_headers: Mapping):
        """Replaces headers of the request. If not a HTTPHeaderDict, it will be converted to one."""
        if isinstance(new_headers, HTTPHeaderDict):
            replacement = new_headers
        elif isinstance(new_headers, Mapping):
            replacement = HTTPHeaderDict(new_headers)
        else:
            raise TypeError('headers must be a mapping')
        self._headers = replacement

    def update(self, url=None, data=None, headers=None, query=None):
        """Update the request in place. Arguments that are left out keep their current value.

        @param url: new url; the current url is kept if not given.
        @param data: new payload data; the current data is kept if None.
        @param headers: headers to merge into the current headers.
        @param query: URL query parameters to update the url with.
        """
        # The data setter runs even when the payload stays the same
        self.data = self.data if data is None else data
        self.headers.update(headers or {})
        self.url = update_url_query(url or self.url, query or {})

    def copy(self):
        """Return a new request of the same class with copied headers, proxies and extensions.

        Headers and proxies are deep-copied, extensions are shallow-copied
        and the payload data is passed on as is.
        """
        return self.__class__(
            url=self.url,
            data=self._data,
            method=self._method,
            headers=copy.deepcopy(self.headers),
            proxies=copy.deepcopy(self.proxies),
            extensions=copy.copy(self.extensions),
        )
444
445
# Shortcuts that construct a Request with the HTTP method preset to HEAD / PUT
HEADRequest = functools.partial(Request, method='HEAD')
PUTRequest = functools.partial(Request, method='PUT')
448
449
class Response(io.IOBase):
    """
    Base class for HTTP response adapters.

    By default, it is a basic wrapper around a file-like response object.

    Interface partially backwards-compatible with addinfourl and http.client.HTTPResponse.

    @param fp: Original, file-like, response.
    @param url: URL that this is a response of.
    @param headers: response headers.
    @param status: Response HTTP status code. Default is 200 OK.
    @param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
    """

    def __init__(
            self,
            fp: typing.IO,
            url: str,
            headers: Mapping[str, str],
            status: int = 200,
            reason: str = None):

        self.fp = fp
        self.url = url
        self.status = status

        # The headers are stored as an email.message.Message
        message = Message()
        for header_name, header_value in headers.items():
            message.add_header(header_name, header_value)
        self.headers = message

        try:
            resolved_reason = reason or HTTPStatus(status).phrase
        except ValueError:
            # Not a status code known to HTTPStatus
            resolved_reason = None
        self.reason = resolved_reason

    def readable(self):
        return self.fp.readable()

    def read(self, amt: int = None) -> bytes:
        """Read from the wrapped response. Any error raised while reading is wrapped in a TransportError."""
        # Expected errors raised here should be of type RequestError or subclasses.
        # Subclasses should redefine this method with more precise error handling.
        try:
            chunk = self.fp.read(amt)
        except Exception as err:
            raise TransportError(cause=err) from err
        return chunk

    def close(self):
        """Close the wrapped response first, then this object."""
        self.fp.close()
        return super().close()

    def get_header(self, name, default=None):
        """Get header for name.
        If there are multiple matching headers, return all separated by comma."""
        values = self.headers.get_all(name)
        if not values:
            return default
        # Special case, only get the first one
        # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.3-4.1
        only_first = name.title() == 'Set-Cookie'
        return values[0] if only_first else ', '.join(values)

    # The following methods are for compatibility reasons and are deprecated
    @property
    def code(self):
        """Deprecated alias of `status`."""
        deprecation_warning('Response.code is deprecated, use Response.status', stacklevel=2)
        return self.status

    def getcode(self):
        """Deprecated; returns `status`."""
        deprecation_warning('Response.getcode() is deprecated, use Response.status', stacklevel=2)
        return self.status

    def geturl(self):
        """Deprecated; returns `url`."""
        deprecation_warning('Response.geturl() is deprecated, use Response.url', stacklevel=2)
        return self.url

    def info(self):
        """Deprecated; returns `headers`."""
        deprecation_warning('Response.info() is deprecated, use Response.headers', stacklevel=2)
        return self.headers

    def getheader(self, name, default=None):
        """Deprecated; forwards to get_header()."""
        deprecation_warning('Response.getheader() is deprecated, use Response.get_header', stacklevel=2)
        return self.get_header(name, default)