]>
Commit | Line | Data |
---|---|---|
1 | from __future__ import annotations | |
2 | ||
3 | import functools | |
4 | import http.client | |
5 | import io | |
6 | import ssl | |
7 | import urllib.error | |
8 | import urllib.parse | |
9 | import urllib.request | |
10 | import urllib.response | |
11 | import zlib | |
12 | from urllib.request import ( | |
13 | DataHandler, | |
14 | FileHandler, | |
15 | FTPHandler, | |
16 | HTTPCookieProcessor, | |
17 | HTTPDefaultErrorHandler, | |
18 | HTTPErrorProcessor, | |
19 | UnknownHandler, | |
20 | ) | |
21 | ||
22 | from ._helper import ( | |
23 | InstanceStoreMixin, | |
24 | add_accept_encoding_header, | |
25 | create_connection, | |
26 | create_socks_proxy_socket, | |
27 | get_redirect_method, | |
28 | make_socks_proxy_opts, | |
29 | select_proxy, | |
30 | ) | |
31 | from .common import Features, RequestHandler, Response, register_rh | |
32 | from .exceptions import ( | |
33 | CertificateVerifyError, | |
34 | HTTPError, | |
35 | IncompleteRead, | |
36 | ProxyError, | |
37 | RequestError, | |
38 | SSLError, | |
39 | TransportError, | |
40 | ) | |
41 | from ..dependencies import brotli | |
42 | from ..socks import ProxyError as SocksProxyError | |
43 | from ..utils import update_url_query | |
44 | from ..utils.networking import normalize_url | |
45 | ||
# Content-Encoding tokens this handler advertises and can transparently decode
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# Brotli support is optional — enabled only when the dependency is importable
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
52 | ||
53 | ||
def _create_http_connection(http_class, source_address, *args, **kwargs):
    """Instantiate *http_class*, wiring in our connection helper and an
    optional local source address.

    Remaining positional/keyword arguments are forwarded to the class
    constructor unchanged.
    """
    connection = http_class(*args, **kwargs)

    # Route socket creation through our create_connection helper when the
    # connection class supports overriding it
    if hasattr(connection, '_create_connection'):
        connection._create_connection = create_connection

    # Port 0 lets the OS pick an ephemeral port on the requested interface
    if source_address is not None:
        connection.source_address = (source_address, 0)

    return connection
64 | ||
65 | ||
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        # context: ssl.SSLContext used for HTTPS connections
        # source_address: local IP address to bind outgoing sockets to
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        """Return *base*, or a SOCKS-wrapping subclass of it when the request
        carries a Ytdl-socks-proxy header (attached by ProxyHandler)."""
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body, tolerating both raw deflate
        streams and zlib-wrapped ones."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a Brotli-encoded body (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        # (wbits = MAX_WBITS | 16 selects gzip-format decoding)
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded body so downstream consumers see a normal
            # file-like response object with the original metadata
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
180 | ||
181 | ||
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels the
    connection through the given SOCKS proxy URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            # Connect the socket to the SOCKS proxy; the handshake that
            # tunnels on to (self.host, self.port) is done by
            # create_socks_proxy_socket.
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            # For HTTPS, negotiate TLS with the real target through the tunnel
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
202 | ||
203 | ||
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Funnel every redirect status through the stock 302 handler; the
    # method/body/header adjustments all happen in redirect_request below.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Non-redirect codes are surfaced to the caller as plain HTTP errors
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        # k.title() normalizes header-name casing for the membership test
        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
241 | ||
242 | ||
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler supporting both HTTP(S) and SOCKS proxies.

    For SOCKS proxy URLs, a ``Ytdl-socks-proxy`` header is attached to the
    request and the socket-level wrapping is left to the HTTP handler's
    connection classes; other proxies are delegated to the standard urllib
    ProxyHandler implementation.
    """

    # Run early so the proxy is selected before other handlers see the request
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping of URL scheme -> proxy URL (or None for no proxies)
        self.proxies = proxies
        # Register <scheme>_open callbacks so OpenerDirector routes these
        # schemes through proxy_open.
        # NOTE: `scheme` (not `type`) avoids shadowing the builtin.
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy configured for this URL; let other handlers proceed
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # The http/https handlers wrap the socket with SOCKS themselves
            return None
        # Plain HTTP(S)/FTP proxy: reuse urllib's stock proxy handling
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
262 | ||
263 | ||
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
267 | ||
268 | ||
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
272 | ||
273 | ||
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req* as a fresh Request, optionally overriding its URL, body,
    headers, or query string.

    The clone keeps the original method (via HEADRequest/PUTRequest when
    needed), origin host, unverifiable flag, and timeout attribute.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Pick a Request subclass that preserves a non-default HTTP method
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request
    clone = request_class(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not a standard Request attribute, so copy it only if present
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
292 | ||
293 | ||
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        """Read up to *amt* bytes from the underlying response, translating
        low-level read errors into the framework's exception hierarchy."""
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            # Bare `raise` re-raises with the original traceback intact
            # (unlike `raise e`, which appends this frame to it)
            raise
314 | ||
315 | ||
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the framework's SSL exception types.

    Certificate-verification failures map to CertificateVerifyError; every
    other SSL error maps to SSLError. Non-SSL exceptions are ignored.
    """
    if not isinstance(e, ssl.SSLError):
        return
    exc_type = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise exc_type(cause=e) from e
322 | ||
323 | ||
def handle_response_read_exceptions(e):
    """Translate low-level read errors into the framework's exception
    hierarchy; exceptions matching no branch are left untouched.

    NOTE: branch order matters — ssl.SSLError is an OSError subclass, so it
    must be tested before the generic OSError branch below.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # handle_sslerror raises for every ssl.SSLError instance
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
332 | ||
333 | ||
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """Request handler backed by the standard-library urllib stack."""

    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// is opt-in: it is only advertised when explicitly enabled
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        # Popping an extension marks it as supported/consumed by this handler
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build and cache a urllib OpenerDirector wired with our handlers."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        """Send *request* through a cached opener, translating urllib errors
        into the framework's exception hierarchy."""
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request)
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            # May raise a more specific error (IncompleteRead, SSLError, ...)
            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)