# Extracted from the yt-dlp repository — yt_dlp/networking/_urllib.py
# [networking] Add module (#2861)
1 import functools
2 import gzip
3 import http.client
4 import io
5 import socket
6 import ssl
7 import urllib.error
8 import urllib.parse
9 import urllib.request
10 import urllib.response
11 import zlib
12
13 from ._helper import (
14 add_accept_encoding_header,
15 get_redirect_method,
16 make_socks_proxy_opts,
17 )
18 from ..dependencies import brotli
19 from ..socks import sockssocket
20 from ..utils import escape_url, update_url_query
21 from ..utils.networking import clean_headers, std_headers
22
# Content encodings advertised in the Accept-Encoding request header and
# decoded transparently when they appear in Content-Encoding responses.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']

# Brotli support is optional; advertise 'br' only when the dependency is present.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
27
28
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class* and, when the handler's 'source_address' param
    is set, patch the connection so outgoing sockets bind to that address and
    only remote addresses of the matching IP family are attempted.

    NOTE(review): `is_https` is accepted by the signature but unused in the
    visible code — presumably kept for caller compatibility; confirm.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A '.' in the source address distinguishes an IPv4 literal from IPv6.
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # Remote offers addresses, but none in the family we can bind from.
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and try the next resolved address.
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only patch when the connection class exposes the private hook.
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
74
75
class HTTPHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        # Route through a SOCKS proxy when requested via the internal
        # Ytdl-socks-proxy header (set by ProxyHandler.proxy_open).
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate payload, raw or zlib-wrapped."""
        if not data:
            return data
        try:
            # Raw deflate stream (no zlib header), sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli payload (requires the optional dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip payload, tolerating trailing junk."""
        # Pass empty payloads through unchanged, consistent with
        # deflate() and brotli() above.
        if not data:
            return data
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        clean_headers(req.headers)
        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        # An explicit `is None` check (not truthiness) is used so that an empty
        # intermediate result (b'') is not mistaken for "body not read yet",
        # which would trigger a second read() of the exhausted response.
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(resp.read() if decoded_response is None else decoded_response)
            elif encoding == 'deflate':
                decoded_response = self.deflate(resp.read() if decoded_response is None else decoded_response)
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(resp.read() if decoded_response is None else decoded_response)

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
198
199
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL.

    *base_class* must be http.client.HTTPConnection or HTTPSConnection.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # TLS-wrap the tunneled socket after the SOCKS handshake.
            # Every Python 3 HTTPSConnection has an SSLContext in `_context`;
            # the old `ssl.wrap_socket` fallback was Python-2-era dead code
            # (the function itself was removed in Python 3.12).
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
222
223
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # All redirect codes share the same entry point in the base class.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Anything outside the recognized redirect codes is surfaced as an error.
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        payload = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        stripped = {'Cookie'}

        new_method = get_redirect_method(req.get_method(), code)
        if new_method != req.get_method():
            # Method changed (e.g. POST -> GET): drop the payload and its
            # describing headers.
            payload = None
            stripped.update(('Content-Length', 'Content-Type'))

        kept_headers = {
            name: value for name, value in req.headers.items()
            if name.title() not in stripped
        }

        return urllib.request.Request(
            newurl, data=payload, headers=kept_headers, method=new_method,
            origin_req_host=req.origin_req_host, unverifiable=True)
261
262
class ProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that also honours per-request proxies and SOCKS URLs."""

    def __init__(self, proxies=None):
        # Pre-install default http/https entry points that funnel into
        # proxy_open with the '__noproxy__' sentinel; the base __init__ may
        # then override them for schemes present in *proxies*.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    functools.partial(self.proxy_open, proxy='__noproxy__', type=scheme))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (internal Ytdl-request-proxy header) takes
        # precedence over the configured one.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = req_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None

        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
286
287
class PUTRequest(urllib.request.Request):
    """Request subclass that forces the HTTP method to 'PUT'."""

    def get_method(self):
        return 'PUT'
291
292
class HEADRequest(urllib.request.Request):
    """Request subclass that forces the HTTP method to 'HEAD'."""

    def get_method(self):
        return 'HEAD'
296
297
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its URL, payload, headers or query.

    The clone keeps the original request's method (via HEADRequest/PUTRequest
    where applicable), origin host, unverifiable flag and timeout.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})

    # Preserve forced-method request types; everything else maps to the
    # plain urllib Request.
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)

    new_req = request_class(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req