]>
Commit | Line | Data |
---|---|---|
227bf1a3 | 1 | from __future__ import annotations |
2 | ||
c365dba8 | 3 | import functools |
c365dba8 | 4 | import http.client |
5 | import io | |
6 | import socket | |
7 | import ssl | |
8 | import urllib.error | |
9 | import urllib.parse | |
10 | import urllib.request | |
11 | import urllib.response | |
12 | import zlib | |
227bf1a3 | 13 | from urllib.request import ( |
14 | DataHandler, | |
15 | FileHandler, | |
16 | FTPHandler, | |
17 | HTTPCookieProcessor, | |
18 | HTTPDefaultErrorHandler, | |
19 | HTTPErrorProcessor, | |
20 | UnknownHandler, | |
21 | ) | |
c365dba8 | 22 | |
23 | from ._helper import ( | |
227bf1a3 | 24 | InstanceStoreMixin, |
c365dba8 | 25 | add_accept_encoding_header, |
20fbbd92 | 26 | create_connection, |
c365dba8 | 27 | get_redirect_method, |
28 | make_socks_proxy_opts, | |
227bf1a3 | 29 | select_proxy, |
30 | ) | |
62b5c94c | 31 | from .common import Features, RequestHandler, Response, register_rh |
227bf1a3 | 32 | from .exceptions import ( |
33 | CertificateVerifyError, | |
34 | HTTPError, | |
35 | IncompleteRead, | |
36 | ProxyError, | |
37 | RequestError, | |
38 | SSLError, | |
39 | TransportError, | |
c365dba8 | 40 | ) |
41 | from ..dependencies import brotli | |
227bf1a3 | 42 | from ..socks import ProxyError as SocksProxyError |
c365dba8 | 43 | from ..socks import sockssocket |
4bf91228 | 44 | from ..utils import update_url_query |
45 | from ..utils.networking import normalize_url | |
c365dba8 | 46 | |
# Content-Encoding values this module can decode, advertised in Accept-Encoding
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    # brotli is an optional dependency; advertise/handle it only when importable
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
c365dba8 | 53 | |
54 | ||
227bf1a3 | 55 | def _create_http_connection(http_class, source_address, *args, **kwargs): |
c365dba8 | 56 | hc = http_class(*args, **kwargs) |
c365dba8 | 57 | |
20fbbd92 | 58 | if hasattr(hc, '_create_connection'): |
59 | hc._create_connection = create_connection | |
60 | ||
c365dba8 | 61 | if source_address is not None: |
c365dba8 | 62 | hc.source_address = (source_address, 0) |
63 | ||
64 | return hc | |
65 | ||
66 | ||
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        # context: ssl.SSLContext used for https connections
        # source_address: local IP address to bind outgoing connections to
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class with SOCKS support when the request carries
        # the internal Ytdl-socks-proxy header (set by ProxyHandler).
        # The header is popped so it is never sent over the wire.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate payload, accepting both raw and zlib-wrapped streams."""
        if not data:
            return data
        try:
            # Raw deflate stream (negative wbits = no zlib header), as some servers send
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli payload (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip payload."""
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        # (wbits = MAX_WBITS | 16 selects gzip format in zlib)
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        """Percent-encode non-ASCII characters in the request URL before sending."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode compressed response bodies and normalize redirect Location headers."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            # `decoded_response or resp.read()`: read the raw body only once,
            # then keep feeding the previous decode stage's output forward
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded bytes in a response-like object, preserving metadata
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    # del-then-set replaces the header value in place
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
181 | ||
182 | ||
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connection is tunnelled through a SOCKS proxy.

    base_class: http.client.HTTPConnection or http.client.HTTPSConnection.
    socks_proxy: proxy URL accepted by make_socks_proxy_opts.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        # Route address resolution/connection through the shared helper
        _create_connection = create_connection

        def connect(self):
            # Socket factory handed to create_connection: builds a sockssocket
            # aimed at one resolved proxy address and asks the proxy to reach
            # (self.host, self.port).
            def sock_socket_connect(ip_addr, timeout, source_address):
                af, socktype, proto, canonname, sa = ip_addr
                sock = sockssocket(af, socktype, proto)
                try:
                    connect_proxy_args = proxy_args.copy()
                    # Use the resolved proxy address for this particular attempt
                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
                    sock.setproxy(**connect_proxy_args)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    # Connect to the final destination through the proxy
                    sock.connect((self.host, self.port))
                    return sock
                except socket.error:
                    # Ensure the socket is not leaked on a failed attempt
                    sock.close()
                    raise
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
                source_address=self.source_address, _create_socket_func=sock_socket_connect)
            if isinstance(self, http.client.HTTPSConnection):
                # Wrap with TLS only after the SOCKS tunnel is established
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
216 | ||
217 | ||
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every supported redirect status through the same implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request for a redirect, or raise for unsupported codes."""
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        payload = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        stripped = ['Cookie']

        method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if method != req.get_method():
            payload = None
            stripped += ['Content-Length', 'Content-Type']

        kept_headers = {name: value for name, value in req.headers.items() if name.title() not in stripped}

        return urllib.request.Request(
            newurl, headers=kept_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method, data=payload)
255 | ||
256 | ||
class ProxyHandler(urllib.request.BaseHandler):
    """Per-request proxy selection handler supporting HTTP and SOCKS proxies.

    Unlike urllib.request.ProxyHandler, the proxy is chosen at request time via
    select_proxy. SOCKS proxies are not handled here; they are signalled to the
    HTTP handler through the internal Ytdl-socks-proxy header.
    """
    # Run before the default handlers (lower order = earlier)
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping consulted by select_proxy, or None
        self.proxies = proxies
        # Set default handlers: register <scheme>_open hooks so the
        # OpenerDirector routes these schemes through proxy_open.
        # (Renamed loop variable so the builtin `type` is not shadowed.)
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        """Attach proxy info to *req*; returns None so other handlers continue."""
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        # Delegate plain HTTP(S) proxying to urllib's stock implementation
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
c365dba8 | 276 | |
277 | ||
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
281 | ||
282 | ||
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
286 | ||
287 | ||
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req* with url/data/headers/query selectively replaced.

    Falls back to the original request's values for anything not given.
    The method (GET/HEAD/PUT/...) of the original request is preserved.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the original HTTP method via the matching Request subclass
    request_cls = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
        req.get_method(), urllib.request.Request)
    updated = request_cls(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
227bf1a3 | 306 | |
307 | ||
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # Prefer .status over the deprecated .getcode():
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status_code = getattr(res, 'status', None) or res.getcode()
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=status_code, reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        """Read up to *amt* bytes, mapping low-level errors to networking exceptions."""
        try:
            return self.fp.read(amt)
        except Exception as err:
            handle_response_read_exceptions(err)
            raise err
328 | ||
329 | ||
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as CertificateVerifyError or SSLError.

    Anything that is not an ssl.SSLError is ignored (returns None).
    """
    if not isinstance(e, ssl.SSLError):
        return
    # Certificate verification failures get their own, more specific type
    wrapper = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise wrapper(cause=e) from e
336 | ||
337 | ||
def handle_response_read_exceptions(e):
    """Map low-level errors raised while reading a response onto unified networking errors.

    Known error types are re-raised as IncompleteRead / SSL errors / TransportError;
    unrecognised exceptions are left for the caller to re-raise.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
        return
    if isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
346 | ||
347 | ||
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler implementation backed by the standard-library urllib.request stack."""

    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file:// support is opt-in; when enabled, advertise the scheme on this
        # instance only (shadows the class-level tuple)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # NOTE(review): popping appears to mark these extensions as supported
        # for the base class's validation — confirm against RequestHandler
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build an OpenerDirector with our custom handler chain (cached via InstanceStoreMixin)."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        """Send *request* through the opener, mapping urllib errors onto unified exceptions."""
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        # Per-request proxies/cookiejar fall back to the handler-level defaults
        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)