]> jfr.im git - yt-dlp.git/blame - yt_dlp/networking/_urllib.py
[core] Fix support for upcoming Python 3.12 (#8130)
[yt-dlp.git] / yt_dlp / networking / _urllib.py
CommitLineData
227bf1a3 1from __future__ import annotations
2
c365dba8 3import functools
c365dba8 4import http.client
5import io
6import socket
7import ssl
8import urllib.error
9import urllib.parse
10import urllib.request
11import urllib.response
12import zlib
227bf1a3 13from urllib.request import (
14 DataHandler,
15 FileHandler,
16 FTPHandler,
17 HTTPCookieProcessor,
18 HTTPDefaultErrorHandler,
19 HTTPErrorProcessor,
20 UnknownHandler,
21)
c365dba8 22
23from ._helper import (
227bf1a3 24 InstanceStoreMixin,
c365dba8 25 add_accept_encoding_header,
26 get_redirect_method,
27 make_socks_proxy_opts,
227bf1a3 28 select_proxy,
29)
62b5c94c 30from .common import Features, RequestHandler, Response, register_rh
227bf1a3 31from .exceptions import (
32 CertificateVerifyError,
33 HTTPError,
34 IncompleteRead,
35 ProxyError,
36 RequestError,
37 SSLError,
38 TransportError,
c365dba8 39)
40from ..dependencies import brotli
227bf1a3 41from ..socks import ProxyError as SocksProxyError
c365dba8 42from ..socks import sockssocket
4bf91228 43from ..utils import update_url_query
44from ..utils.networking import normalize_url
c365dba8 45
# Content-Encoding values this module can decode locally (advertised via Accept-Encoding).
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decoding a compressed response body.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency; enable it only when the import succeeded.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
c365dba8 52
53
227bf1a3 54def _create_http_connection(http_class, source_address, *args, **kwargs):
c365dba8 55 hc = http_class(*args, **kwargs)
c365dba8 56
57 if source_address is not None:
58 # This is to workaround _create_connection() from socket where it will try all
59 # address data from getaddrinfo() including IPv6. This filters the result from
60 # getaddrinfo() based on the source_address value.
61 # This is based on the cpython socket.create_connection() function.
62 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
63 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
64 host, port = address
65 err = None
66 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
67 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
68 ip_addrs = [addr for addr in addrs if addr[0] == af]
69 if addrs and not ip_addrs:
70 ip_version = 'v4' if af == socket.AF_INET else 'v6'
71 raise OSError(
72 "No remote IP%s addresses available for connect, can't use '%s' as source address"
73 % (ip_version, source_address[0]))
74 for res in ip_addrs:
75 af, socktype, proto, canonname, sa = res
76 sock = None
77 try:
78 sock = socket.socket(af, socktype, proto)
79 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
80 sock.settimeout(timeout)
81 sock.bind(source_address)
82 sock.connect(sa)
83 err = None # Explicitly break reference cycle
84 return sock
85 except OSError as _:
86 err = _
87 if sock is not None:
88 sock.close()
89 if err is not None:
90 raise err
91 else:
92 raise OSError('getaddrinfo returns an empty list')
93 if hasattr(hc, '_create_connection'):
94 hc._create_connection = _create_connection
95 hc.source_address = (source_address, 0)
96
97 return hc
98
99
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address  # local address to bind outgoing sockets to
        self._context = context  # SSL context used for https connections

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when ProxyHandler has
        # tagged this request with the internal Ytdl-socks-proxy header.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body (raw stream or zlib-wrapped)."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream
        except zlib.error:
            return zlib.decompress(data)  # zlib-wrapped deflate stream

    @staticmethod
    def brotli(data):
        """Decompress a brotli-encoded body (requires the optional brotli module)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip-encoded body."""
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
214
215
def make_socks_conn_class(base_class, socks_proxy):
    """Derive from *base_class* a connection class whose sockets are tunneled
    through the SOCKS proxy described by the *socks_proxy* URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_options = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            # Use a SOCKS-capable socket in place of a plain one.
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_options)
            # Only apply a real numeric timeout; presumably this guards against
            # the non-numeric global-default sentinel — do not pass it through.
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap with TLS only after the SOCKS tunnel is up.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
234
235
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every redirect status through the same handler method.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = get_redirect_method(req.get_method(), code)

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        dropped = {'Cookie'}
        if new_method == req.get_method():
            new_data = req.data
        else:
            # Method changed (e.g. POST -> GET): drop the payload and its headers.
            new_data = None
            dropped.update(('Content-Length', 'Content-Type'))

        filtered_headers = {name: value for name, value in req.headers.items() if name.title() not in dropped}

        return urllib.request.Request(
            newurl, headers=filtered_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
273
274
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler that supports http(s) and SOCKS proxies per-scheme."""

    handler_order = 100  # run before the default handlers

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Register proxy_open for each supported scheme. Note: the original
        # loop variable shadowed the builtin `type`; renamed to `scheme`.
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            # Tag the request; yt-dlp's http/https handlers wrap the socket
            # with SOCKS support when they see this header.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        # Delegate plain http/https/ftp proxying to the stdlib implementation.
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
c365dba8 294
295
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always PUT."""

    def get_method(self):
        return 'PUT'
299
300
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always HEAD."""

    def get_method(self):
        return 'HEAD'
304
305
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req* with url/data/headers/query optionally replaced,
    preserving the original HTTP method and timeout."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    # Preserve the method by picking the matching Request subclass.
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)
    new_req = request_class(
        update_url_query(url or req.get_full_url(), query),
        data=req.data if data is None else data,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
227bf1a3 324
325
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status = getattr(res, 'status', None) or res.getcode()
        reason = getattr(res, 'reason', None)
        super().__init__(fp=res, headers=res.headers, url=res.url, status=status, reason=reason)

    def read(self, amt=None):
        # Translate low-level read failures into the networking exception
        # hierarchy before letting the original exception propagate.
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e
346
347
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching networking-layer error.

    Certificate-verification failures map to CertificateVerifyError, any other
    SSL error to SSLError; non-SSL exceptions are ignored (function returns).
    """
    # SSLCertVerificationError is a subclass of SSLError, so check it first.
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    if isinstance(e, ssl.SSLError):
        raise SSLError(cause=e) from e
354
355
def handle_response_read_exceptions(e):
    """Translate exceptions raised while reading a response into the
    networking error hierarchy; unrecognized exceptions pass through silently."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
        return
    if isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
364
365
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler backed by the standard library's urllib.request."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// support is opt-in; when enabled, extend the scheme list.
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # These extensions are handled by this class (see _send/_get_instance);
        # pop them so they are not reported as unsupported.
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        # Build an OpenerDirector wired with this module's custom handlers;
        # instances are reused via InstanceStoreMixin keyed on the arguments.
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        # Translate the framework Request into a urllib Request and map every
        # urllib/socket/ssl failure onto the networking exception hierarchy.
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)