]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/_urllib.py
[networking] Remove dot segments during URL normalization (#7662)
[yt-dlp.git] / yt_dlp / networking / _urllib.py
1 from __future__ import annotations
2
3 import functools
4 import gzip
5 import http.client
6 import io
7 import socket
8 import ssl
9 import urllib.error
10 import urllib.parse
11 import urllib.request
12 import urllib.response
13 import zlib
14 from urllib.request import (
15 DataHandler,
16 FileHandler,
17 FTPHandler,
18 HTTPCookieProcessor,
19 HTTPDefaultErrorHandler,
20 HTTPErrorProcessor,
21 UnknownHandler,
22 )
23
24 from ._helper import (
25 InstanceStoreMixin,
26 add_accept_encoding_header,
27 get_redirect_method,
28 make_socks_proxy_opts,
29 select_proxy,
30 )
31 from .common import Features, RequestHandler, Response, register_rh
32 from .exceptions import (
33 CertificateVerifyError,
34 HTTPError,
35 IncompleteRead,
36 ProxyError,
37 RequestError,
38 SSLError,
39 TransportError,
40 )
41 from ..dependencies import brotli
42 from ..socks import ProxyError as SocksProxyError
43 from ..socks import sockssocket
44 from ..utils import update_url_query
45 from ..utils.networking import normalize_url
46
# Content-Encoding values advertised in Accept-Encoding and decodable by HTTPHandler
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# Brotli support is optional; enable it only when the dependency is importable
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
53
54
55 def _create_http_connection(http_class, source_address, *args, **kwargs):
56 hc = http_class(*args, **kwargs)
57
58 if source_address is not None:
59 # This is to workaround _create_connection() from socket where it will try all
60 # address data from getaddrinfo() including IPv6. This filters the result from
61 # getaddrinfo() based on the source_address value.
62 # This is based on the cpython socket.create_connection() function.
63 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
64 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
65 host, port = address
66 err = None
67 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
68 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
69 ip_addrs = [addr for addr in addrs if addr[0] == af]
70 if addrs and not ip_addrs:
71 ip_version = 'v4' if af == socket.AF_INET else 'v6'
72 raise OSError(
73 "No remote IP%s addresses available for connect, can't use '%s' as source address"
74 % (ip_version, source_address[0]))
75 for res in ip_addrs:
76 af, socktype, proto, canonname, sa = res
77 sock = None
78 try:
79 sock = socket.socket(af, socktype, proto)
80 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
81 sock.settimeout(timeout)
82 sock.bind(source_address)
83 sock.connect(sa)
84 err = None # Explicitly break reference cycle
85 return sock
86 except OSError as _:
87 err = _
88 if sock is not None:
89 sock.close()
90 if err is not None:
91 raise err
92 else:
93 raise OSError('getaddrinfo returns an empty list')
94 if hasattr(hc, '_create_connection'):
95 hc._create_connection = _create_connection
96 hc.source_address = (source_address, 0)
97
98 return hc
99
100
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Local IP address that outgoing connections are bound to (see
        # _create_http_connection); None means no binding.
        self._source_address = source_address
        # ssl.SSLContext passed to HTTPS connections
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # If ProxyHandler tagged this request with a SOCKS proxy, wrap the
        # connection class so its socket is established through that proxy.
        # The internal header is popped so it is never sent over the wire.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        """Open a plain HTTP connection for *req*."""
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        """Open an HTTPS connection for *req* using the configured SSL context."""
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body; empty input is returned as-is."""
        if not data:
            return data
        try:
            # Try raw deflate first (no zlib header), as sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to the standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli-encoded body; empty input is returned as-is."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip-encoded body, tolerating trailing junk bytes."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                # No truncation up to 1023 bytes helped; surface the first error
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded body so downstream consumers see a normal response
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS uses the same URL-escaping and decompression logic
    https_request = http_request
    https_response = http_response
224
225
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* whose socket is opened
    through the SOCKS proxy described by the *socks_proxy* URL."""
    assert issubclass(base_class, (http.client.HTTPConnection, http.client.HTTPSConnection))

    setproxy_kwargs = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**setproxy_kwargs)
            # Deliberate exact-type check: bool and timeout sentinels must not match
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the proxied socket with TLS
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
244
245
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # The Cookie header should technically live in unredirected_hdrs, but
        # some callers set it as a normal header anyway; strip it so cookies
        # never leak to the redirect target.
        stripped_headers = {'Cookie'}

        redirected_method = get_redirect_method(req.get_method(), code)
        redirected_data = req.data
        if redirected_method != req.get_method():
            # Method changed (e.g. POST -> GET): drop the payload and its headers
            redirected_data = None
            stripped_headers.update(('Content-Length', 'Content-Type'))

        kept_headers = {k: v for k, v in req.headers.items() if k.title() not in stripped_headers}

        return urllib.request.Request(
            newurl, headers=kept_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=redirected_method, data=redirected_data)
283
284
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler supporting HTTP and SOCKS proxies.

    SOCKS proxies are not opened here; they are flagged on the request via the
    internal ``Ytdl-socks-proxy`` header, and the HTTP(S) handlers wrap the
    socket themselves (see HTTPHandler._make_conn_class).
    """
    # Run before the default urllib handlers
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Register a <scheme>_open method per supported scheme so that
        # OpenerDirector routes those requests through proxy_open().
        # NOTE: was `for type in (...)` — renamed to avoid shadowing builtins
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy applies; let other handlers open the request directly
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the socket wrapping with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
304
305
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always PUT."""
    def get_method(self):
        return 'PUT'
309
310
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always HEAD."""
    def get_method(self):
        return 'HEAD'
314
315
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req*, optionally overriding its url, data, headers or
    query string, while preserving the HTTP method and timeout."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve non-standard methods via the dedicated Request subclasses
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)
    new_req = request_cls(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
334
335
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # Prefer .status over the deprecated .getcode():
        # addinfourl gained .status in Python 3.9 (.getcode() deprecated) [1];
        # HTTPResponse always had .status (.getcode() deprecated) [2].
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status = getattr(res, 'status', None) or res.getcode()
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=status, reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read failures into yt-dlp networking errors
            handle_response_read_exceptions(e)
            raise e
356
357
def handle_sslerror(e: ssl.SSLError):
    """Re-raise *e* as the matching yt-dlp SSL exception; no-op for non-SSL errors."""
    if not isinstance(e, ssl.SSLError):
        return
    exc_cls = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise exc_cls(cause=e) from e
364
365
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response body onto yt-dlp
    networking errors; exceptions of other types pass through untouched."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # Delegates to the dedicated SSL translator (always raises for SSLError)
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
374
375
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """Request handler backed by the standard-library urllib stack."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file: URLs are opt-in only; enabling them allows local file reads
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        """Consume the extensions this handler supports (cookiejar, timeout)."""
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build an OpenerDirector wired with yt-dlp's urllib handlers."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        # Merge per-request headers over the handler-level defaults
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        # Openers are cached per (proxies, cookiejar) via InstanceStoreMixin
        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
464 return UrllibResponseAdapter(res)