# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url

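# Encodings advertised via the Accept-Encoding request header. Brotli ('br') is
# appended below only when the optional brotli dependency is importable, so we
# never advertise an encoding we cannot decode.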
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
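        # A port of 0 lets the OS pick an ephemeral source port while still
        # binding to the requested local address.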
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
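        # Some servers send raw DEFLATE streams without the zlib wrapper, so try
        # decoding raw first and fall back to zlib-wrapped data on error.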
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
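        # wbits=zlib.MAX_WBITS | 16 instructs zlib to expect a gzip header and
        # trailer rather than a zlib wrapper.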
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites - some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL only if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
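        # e.g. for 'Content-Encoding: deflate, gzip' the body was deflated and then
        # gzipped, so it is un-gzipped first and then inflated.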
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
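            # For HTTPS, the TLS handshake happens over the established SOCKS
            # tunnel, with the target host sent as the server_hostname (SNI).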
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

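    # All supported 3xx codes share the 302 implementation; per-code differences
    # (method change, payload removal) are handled in redirect_request() below.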
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # Only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
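    # urllib's BaseHandler defaults to handler_order 500; 100 makes this
    # handler's *_open methods run before the protocol handlers, so the proxy
    # decision is made before any connection is opened.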
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket for SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


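# Request subclasses that pin the HTTP verb; urllib.request.Request otherwise
# derives the method from whether payload data is present.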
class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
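        # Pop the extensions this handler supports; anything left over is
        # treated as unsupported by the base class.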
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

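        # file:// support is opt-in (disabled by default for security reasons)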
        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # Proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
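

# A minimal usage sketch (illustrative only, not part of this module; it
# assumes the Request class and the RequestHandler.send()/close() API from
# yt_dlp.networking, used directly rather than through a RequestDirector):
#
#   from yt_dlp.networking import Request
#
#   rh = UrllibRH()
#   try:
#       response = rh.send(Request('https://example.com'))
#       body = response.read()
#   finally:
#       rh.close()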