]>
Commit | Line | Data |
---|---|---|
227bf1a3 | 1 | from __future__ import annotations |
2 | ||
c365dba8 | 3 | import functools |
c365dba8 | 4 | import http.client |
5 | import io | |
6 | import socket | |
7 | import ssl | |
8 | import urllib.error | |
9 | import urllib.parse | |
10 | import urllib.request | |
11 | import urllib.response | |
12 | import zlib | |
227bf1a3 | 13 | from urllib.request import ( |
14 | DataHandler, | |
15 | FileHandler, | |
16 | FTPHandler, | |
17 | HTTPCookieProcessor, | |
18 | HTTPDefaultErrorHandler, | |
19 | HTTPErrorProcessor, | |
20 | UnknownHandler, | |
21 | ) | |
c365dba8 | 22 | |
23 | from ._helper import ( | |
227bf1a3 | 24 | InstanceStoreMixin, |
c365dba8 | 25 | add_accept_encoding_header, |
26 | get_redirect_method, | |
27 | make_socks_proxy_opts, | |
227bf1a3 | 28 | select_proxy, |
29 | ) | |
62b5c94c | 30 | from .common import Features, RequestHandler, Response, register_rh |
227bf1a3 | 31 | from .exceptions import ( |
32 | CertificateVerifyError, | |
33 | HTTPError, | |
34 | IncompleteRead, | |
35 | ProxyError, | |
36 | RequestError, | |
37 | SSLError, | |
38 | TransportError, | |
c365dba8 | 39 | ) |
40 | from ..dependencies import brotli | |
227bf1a3 | 41 | from ..socks import ProxyError as SocksProxyError |
c365dba8 | 42 | from ..socks import sockssocket |
4bf91228 | 43 | from ..utils import update_url_query |
44 | from ..utils.networking import normalize_url | |
c365dba8 | 45 | |
# Content-Encodings advertised via Accept-Encoding and decoded in HTTPHandler.http_response()
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency; advertise/decode 'br' only when importable
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
c365dba8 | 52 | |
53 | ||
def _create_http_connection(http_class, source_address, *args, **kwargs):
    """Instantiate *http_class* (an http.client connection class), optionally
    patched so that outgoing sockets bind to *source_address*.

    When *source_address* is not None, the connection's socket factory is
    replaced with one that only connects over the address family matching
    the source address (IPv4 vs IPv6).
    """
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address: dotted-quad => IPv4
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
98 | ||
99 | ||
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address  # local address to bind outgoing sockets to
        self._context = context  # SSL context used for HTTPS connections

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when ProxyHandler flagged
        # this request via the internal Ytdl-socks-proxy header (popped here
        # so it never reaches the wire).
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate streams without the zlib header,
        # so retry without header interpretation on failure.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # Only reachable when the optional brotli dependency is available
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        # wbits=MAX_WBITS|16 selects gzip-wrapped decoding in zlib
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded bytes so downstream consumers see a normal response
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
214 | ||
215 | ||
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL.

    *base_class* must be http.client.HTTPConnection or HTTPSConnection.
    For HTTPS, the TLS handshake is performed on top of the established
    SOCKS tunnel.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            # NOTE: deliberately excludes bool and other numeric subclasses
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                # Wrap only after the tunnel is up; SNI uses the target host
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
234 | ||
235 | ||
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Anything outside the recognised redirect codes is surfaced as an error
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        payload = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        dropped = ['Cookie']

        method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if method != req.get_method():
            payload = None
            dropped += ['Content-Length', 'Content-Type']

        kept_headers = {
            name: value
            for name, value in req.headers.items()
            if name.title() not in dropped
        }

        return urllib.request.Request(
            newurl, headers=kept_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method, data=payload)
273 | ||
274 | ||
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler that routes http/https/ftp requests through the proxy
    selected for their URL, deferring SOCKS proxies to the HTTP handler.

    handler_order=100 ensures this runs before the default handlers.
    """
    handler_order = 100

    def __init__(self, proxies=None):
        # Mapping of scheme (or special keys understood by select_proxy) -> proxy URL
        self.proxies = proxies
        # Register <scheme>_open hooks so OpenerDirector routes every
        # supported scheme through proxy_open().
        # NOTE: renamed loop variable from `type` to avoid shadowing the builtin.
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % scheme, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy configured for this URL; let other handlers proceed
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
c365dba8 | 294 | |
295 | ||
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
299 | ||
300 | ||
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
304 | ||
305 | ||
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its URL, body, headers and query.

    The clone preserves the original HTTP method (via HEADRequest/PUTRequest
    subclasses where applicable), origin_req_host, unverifiable flag and,
    when present, the timeout attribute.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    new_data = req.data if data is None else data
    new_url = update_url_query(url or req.get_full_url(), query)

    # Preserve non-default methods by picking the matching Request subclass
    method = req.get_method()
    if method == 'HEAD':
        req_class = HEADRequest
    elif method == 'PUT':
        req_class = PUTRequest
    else:
        req_class = urllib.request.Request

    clone = req_class(
        new_url, data=new_data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
227bf1a3 | 324 | |
325 | ||
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status_code = getattr(res, 'status', None) or res.getcode()
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=status_code, reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        # Translate low-level read failures into framework exceptions
        try:
            return self.fp.read(amt)
        except Exception as err:
            handle_response_read_exceptions(err)
            raise err
346 | ||
347 | ||
def handle_sslerror(e: ssl.SSLError):
    """Re-raise *e* as CertificateVerifyError or SSLError.

    Anything that is not an ssl.SSLError is ignored (the function returns).
    """
    # SSLCertVerificationError is a subclass of SSLError, so check it first
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    if isinstance(e, ssl.SSLError):
        raise SSLError(cause=e) from e
354 | ||
355 | ||
def handle_response_read_exceptions(e):
    """Translate exceptions raised while reading a response into the
    framework's networking exceptions; unrecognised types pass through
    silently (the caller re-raises them itself)."""
    # IncompleteRead must be tested before the generic HTTPException below
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    # SSLError is an OSError subclass, so it must be tested before OSError
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
364 | ||
365 | ||
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler implementation backed by the stdlib urllib.request stack.

    Openers are built per (proxies, cookiejar) pair and cached via
    InstanceStoreMixin. Errors from urllib/http.client are translated into
    the framework's exception hierarchy in _send().
    """
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file:// handling is opt-in (extends the supported schemes below)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # cookiejar and timeout are consumed by this handler; pop them so
        # they are not reported as unsupported extensions
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        # Assemble an OpenerDirector with our custom proxy/HTTP/redirect
        # handlers plus the standard urllib ones.
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)