# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # Work around socket.create_connection(), which tries every address
        # returned by getaddrinfo() (including IPv6), by filtering the
        # getaddrinfo() results based on the source_address value.
        # This is based on the CPython socket.create_connection() implementation.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
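            # Infer the address family from the source address: a dotted-quad
            # string is IPv4, anything else is treated as IPv6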
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
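        # Binding to port 0 lets the OS pick an ephemeral local port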
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
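        # ProxyHandler.proxy_open (below) smuggles the SOCKS proxy URL in this
        # internal header; pop it here so it is never sent to the server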
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
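        # Per RFC 9110, 'deflate' means a zlib-wrapped stream, but some servers
        # send raw deflate data; try raw first and fall back to zlib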
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk appended to the end of the file;
        # we ignore it by only ever decoding a single gzip payload
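        # wbits=MAX_WBITS | 16 tells zlib to expect a gzip container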
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites, and some hand out URLs with non-ASCII characters
        # that are not percent-encoded (see telemb.py, ard.py [#3412]).
        # urllib chokes on such URLs (see http://bugs.python.org/issue3991), so we work
        # around the issue by replacing the request's original URL with a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround was moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order in which they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
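        # e.g. 'Content-Encoding: gzip, br' means gzip was applied first and
        # brotli second, so we decode brotli first and then gzip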
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

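            # For HTTPS connections, TLS is negotiated on top of the
            # established SOCKS tunnel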
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

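    # Route every supported redirect status through the same handler;
    # redirect_request below decides how the method and body are rewritten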
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
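        # e.g. a POST is turned into a GET on 301/302/303 (matching browser
        # behavior), while 307/308 preserve the original method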
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


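# Map low-level read errors onto the unified exception classes from .exceptions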
def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors:
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as when the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
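

# Hedged usage sketch (illustration only): in yt-dlp, requests normally reach
# this handler through yt_dlp.networking.common.RequestDirector, and the
# constructor arguments shown here are assumptions, not the exact required set:
#
#   from yt_dlp.networking.common import Request
#
#   rh = UrllibRH(logger=logger, verbose=False)
#   response = rh.send(Request('https://example.com'))
#   data = response.read()
#   rh.close()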