from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        hc.source_address = (source_address, 0)

    return hc


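# Illustrative sketch (not part of the original module): constructing the
# connection performs no network I/O, so the source-address injection can be
# observed directly. The host and local address below are placeholders.
def _demo_create_http_connection():
    conn = _create_http_connection(
        http.client.HTTPConnection, '127.0.0.1', 'example.com', 80)
    # The socket will be bound to this local address (port 0 = any free port)
    assert conn.source_address == ('127.0.0.1', 0)

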
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added at the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around this issue we replace the request's original URL with
        # a percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if it changed after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order in which they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


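# Illustrative sketch (not part of the original module): a body compressed
# with deflate and then gzip, i.e. "Content-Encoding: deflate, gzip", is
# decoded by undoing the encodings in reverse header order, exactly as the
# loop in http_response above does.
def _demo_content_decoding():
    import gzip
    body = b'example payload'
    wire = gzip.compress(zlib.compress(body))  # deflate first, then gzip
    # Undo the outer (last-applied) gzip, then the inner deflate
    assert HTTPHandler.deflate(HTTPHandler.gz(wire)) == body

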
def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            def sock_socket_connect(ip_addr, timeout, source_address):
                af, socktype, proto, canonname, sa = ip_addr
                sock = sockssocket(af, socktype, proto)
                try:
                    connect_proxy_args = proxy_args.copy()
                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
                    sock.setproxy(**connect_proxy_args)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect((self.host, self.port))
                    return sock
                except socket.error:
                    sock.close()
                    raise
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
                source_address=self.source_address, _create_socket_func=sock_socket_connect)
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


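# Illustrative sketch (not part of the original module): deriving a
# SOCKS-tunnelled connection class is side-effect free until connect() is
# called. The proxy URL and host below are placeholders.
def _demo_socks_conn_class():
    conn_class = make_socks_conn_class(
        http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
    assert issubclass(conn_class, http.client.HTTPConnection)
    conn_class('example.com', 80)  # no traffic happens until connect()

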
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


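# Illustrative sketch (not part of the original module): a 303 response to a
# POST is expected to become a GET with the payload, the body headers and any
# Cookie header dropped. The URLs are placeholders.
def _demo_redirect_method_change():
    req = urllib.request.Request(
        'http://example.com/submit', data=b'payload', method='POST',
        headers={'Content-Type': 'application/x-www-form-urlencoded', 'Cookie': 'a=b'})
    new_req = RedirectHandler().redirect_request(
        req, None, 303, 'See Other', {}, 'http://example.com/result')
    assert new_req.get_method() == 'GET'
    assert new_req.data is None
    assert 'Cookie' not in new_req.headers

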
class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


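# Illustrative sketch (not part of the original module): SOCKS proxies are
# not opened here; they are tagged on the request via a private header and
# picked up later by HTTPHandler._make_conn_class. The proxy URL is a placeholder.
def _demo_proxy_tagging():
    handler = ProxyHandler({'http': 'socks5://127.0.0.1:1080'})
    req = urllib.request.Request('http://example.com')
    handler.proxy_open(req)
    assert req.get_header('Ytdl-socks-proxy') == 'socks5://127.0.0.1:1080'

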
class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


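# Illustrative sketch (not part of the original module): update_Request
# preserves the method while swapping in new query parameters. The URL is a
# placeholder.
def _demo_update_Request():
    req = HEADRequest('http://example.com/path')
    new_req = update_Request(req, query={'page': '2'})
    assert new_req.get_method() == 'HEAD'
    assert new_req.get_full_url() == 'http://example.com/path?page=2'

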
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


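# Illustrative sketch (not part of the original module): a bare addinfourl
# can be adapted into the unified Response API without any network I/O.
def _demo_response_adapter():
    import email.message
    raw = urllib.response.addinfourl(
        io.BytesIO(b'hello'), email.message.Message(), 'http://example.com', 200)
    res = UrllibResponseAdapter(raw)
    assert res.status == 200 and res.read() == b'hello'

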
def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


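# Illustrative sketch (not part of the original module): low-level read
# errors are normalized into this package's exception hierarchy. This assumes
# the networking IncompleteRead stores the partial byte count and expected
# total under the names passed above.
def _demo_exception_mapping():
    try:
        handle_response_read_exceptions(
            http.client.IncompleteRead(b'abc', expected=7))
    except IncompleteRead as e:
        assert e.partial == 3 and e.expected == 7

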
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
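

# Illustrative sketch (not part of the original module): the handler is
# normally driven through a RequestDirector, but it can be exercised directly
# with a data: URL, which needs no network access. This assumes Request is
# importable from .common alongside Response.
def _demo_urllib_rh():
    from .common import Request  # assumed import location
    with UrllibRH() as rh:
        res = rh.send(Request('data:text/plain,hello'))
        assert res.read() == b'hello'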