from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is a workaround for socket.create_connection(), which tries all
        # addresses returned by getaddrinfo(), including IPv6 ones. Instead, we
        # filter the getaddrinfo() results by the address family matching the
        # given source_address.
        # Based on the CPython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as e:
                    err = e
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
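
# Illustrative sketch (hypothetical values): with source_address '192.0.2.1',
# the patched _create_connection() keeps only AF_INET results, so a host that
# resolves solely to IPv6 raises OSError instead of connecting from the wrong
# address family:
#   conn = _create_http_connection(
#       http.client.HTTPConnection, '192.0.2.1', 'example.com', 80)
#   conn.request('GET', '/')  # binds the socket to ('192.0.2.1', 0)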


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated
    and brotli-compressed responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk appended to the end of the file;
        # we ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)
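
    # Illustrative round-trips (uses the standard-library gzip module;
    # not executed by this module):
    #   >>> HTTPHandler.deflate(zlib.compress(b'payload'))  # zlib-wrapped deflate
    #   b'payload'
    #   >>> import gzip
    #   >>> HTTPHandler.gz(gzip.compress(b'payload'))
    #   b'payload'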

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites, and some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
        # so we work around the issue by replacing the request's original URL with a
        # percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)
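
    # Illustrative example (hypothetical URL): normalize_url() percent-encodes
    # the non-ASCII characters while leaving the rest of the URL intact, e.g.
    #   normalize_url('http://example.com/päth?q=ä')
    #   == 'http://example.com/p%C3%A4th?q=%C3%A4'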

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp
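
    # Illustrative example: for 'Content-Encoding: gzip, br' the body was
    # gzipped first and then brotli-compressed, so it is decoded in reverse
    # order: brotli first, then gzip.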

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
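
# Illustrative usage sketch (hypothetical proxy address): HTTPHandler calls this
# via _make_conn_class() when a 'Ytdl-socks-proxy' header is present, but the
# factory can also be exercised directly:
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)  # connect() tunnels through the proxy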


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some callers set it in the normal headers anyway.
        # Remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # Only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
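
    # Illustrative behaviour (following RFC 7231 and common browser practice,
    # via get_redirect_method()): a POST redirected with 301/302/303 is retried
    # as a GET with its body and Content-* headers dropped, while 307/308 keep
    # both the method and the payload.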


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket for SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
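
    # Illustrative proxies mapping (hypothetical addresses), as consumed by
    # select_proxy():
    #   {'http': 'http://127.0.0.1:3128', 'https': 'socks5://127.0.0.1:1080'}
    # HTTP(S) proxies are handed to urllib's ProxyHandler, while SOCKS proxies
    # are flagged via the Ytdl-socks-proxy header for HTTPHandler to pick up.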


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
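
# Illustrative example (hypothetical URL): update_Request() rebuilds the request
# so the subclass-based method override survives the update:
#   req = HEADRequest('http://example.com/page')
#   new_req = update_Request(req, query={'lang': 'en'})
#   new_req.get_method()    # 'HEAD'
#   new_req.get_full_url()  # 'http://example.com/page?lang=en'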


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e
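
    # Status resolution sketch: getattr(res, 'status', None) covers both
    # response types on modern Python; res.getcode() is only consulted as a
    # fallback for addinfourl objects that lack .status.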


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
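
# Illustrative mapping: http.client.IncompleteRead -> IncompleteRead,
# ssl.SSLCertVerificationError -> CertificateVerifyError, any other
# ssl.SSLError -> SSLError, and ConnectionResetError / EOFError / zlib.error
# -> TransportError. Anything else falls through for the caller to re-raise.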


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when the urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
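
# Illustrative end-to-end sketch (hypothetical and simplified; in yt-dlp this
# handler is normally driven through a RequestDirector rather than directly):
#   from yt_dlp.networking.common import Request
#   with UrllibRH() as rh:
#       response = rh.send(Request('https://example.com'))
#       body = response.read()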