from __future__ import annotations

import functools
import gzip
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
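
# These encodings are advertised to servers via the Accept-Encoding request
# header (see add_accept_encoding_header() in ._helper, used in _send() below);
# with brotli available a request would advertise roughly:
#   Accept-Encoding: gzip, deflate, br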


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is a workaround for _create_connection() in socket, which tries
        # every address returned by getaddrinfo(), including IPv6. This filters
        # the getaddrinfo() results based on the source_address value.
        # Based on the CPython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
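        # e.g. with an IPv4 source_address such as '192.0.2.10' (an example
        # address), only AF_INET results are kept and IPv6 candidates are skipped.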
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
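        # Raw deflate streams lack the zlib header; try raw first
        # (-zlib.MAX_WBITS), then fall back to a zlib-wrapped stream.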
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
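            # Retry with up to 1023 trailing bytes stripped, assuming any
            # trailing garbage is short.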
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around this issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
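        # e.g. "Content-Encoding: gzip, br" means brotli was applied last,
        # so brotli is decoded first, then gzip.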
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

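            # For HTTPS, the TLS handshake must run over the established
            # SOCKS tunnel, so the socket is wrapped only after connect().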
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

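    # Reuse the 302 handler for all supported redirect codes; the differences
    # between them are handled in redirect_request() below.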
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % scheme, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            return None
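        # Note: this class doesn't inherit from urllib.request.ProxyHandler,
        # so its proxy_open() is called unbound with our instance below.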
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
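# Illustrative only: update_Request preserves the request's method class, e.g.
#   new_req = update_Request(req, query={'page': '2'})
# yields a HEADRequest when the original request used HEAD.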


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (see https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
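

# Illustrative only (not part of the original module): @register_rh makes this
# handler discoverable by yt-dlp's request director, which constructs it and
# dispatches requests to it via RequestHandler.send(); it is not normally
# instantiated directly.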