import functools
import gzip
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib

from ._helper import (
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
)
from ..dependencies import brotli
from ..socks import sockssocket
from ..utils import escape_url, update_url_query
from ..utils.networking import clean_headers, std_headers

SUPPORTED_ENCODINGS = ['gzip', 'deflate']

# 'br' is only advertised when the optional brotli dependency is available
if brotli:
    SUPPORTED_ENCODINGS.append('br')
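
# A hypothetical illustration (not part of this module) of what this list is
# used for, assuming add_accept_encoding_header() joins the entries with ', ':
#
#   headers = {}
#   add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
#   # headers['Accept-Encoding'] is now 'gzip, deflate' or 'gzip, deflate, br'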

def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as e:
                    err = e
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')

        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
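
# A hypothetical illustration (not part of this module) of the address-family
# filtering above: with an IPv4 source_address, only AF_INET results from
# getaddrinfo() are attempted, so a dual-stack host is never dialed over IPv6:
#
#   addrs = socket.getaddrinfo('localhost', 80, 0, socket.SOCK_STREAM)
#   ipv4_only = [a for a in addrs if a[0] == socket.AF_INET]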

class HTTPHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Raw deflate stream (no zlib header), as sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror
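
    # A hypothetical sanity check (not part of this module) for the
    # trailing-junk recovery in gz() above: bytes appended after the gzip
    # stream are truncated away until decompression succeeds.
    #
    #   payload = gzip.compress(b'hello') + b'\x00garbage'
    #   assert HTTPHandler.gz(payload) == b'hello'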

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is
        # not always respected by websites - some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we will replace the request's original URL
        # with a percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        clean_headers(req.headers)
        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
        return super().do_request_(req)
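
    # A hypothetical illustration (not part of this module) of the escaping
    # workaround above: escape_url() percent-encodes the non-ASCII characters
    # that urllib would otherwise choke on, e.g.
    #
    #   escape_url('http://example.com/caf\u00e9')
    #   # -> 'http://example.com/caf%C3%A9'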

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 the default charset is iso-8859-1, which is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
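
# A minimal usage sketch (hypothetical, not part of this module), assuming a
# plain params dict; in yt-dlp these handlers are normally installed by
# YoutubeDL itself:
#
#   opener = urllib.request.build_opener(
#       ProxyHandler(), HTTPHandler(params={}), RedirectHandler())
#   with opener.open('http://example.com') as resp:
#       body = resp.read()  # transparently decompressed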

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
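
# A hypothetical usage sketch (not part of this module) mirroring what
# HTTPHandler.http_open() does when it sees a Ytdl-socks-proxy header:
#
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)
#   conn.request('GET', '/')  # TCP traffic is tunneled through the proxy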

class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
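
# A hypothetical illustration (not part of this module) of the method
# rewriting above, assuming get_redirect_method() follows browser behavior:
# a 303 (and, historically, 301/302) turns POST into GET and drops the
# payload, while 307/308 preserve both the method and the body.
#
#   # get_redirect_method('POST', 303) -> 'GET'
#   # get_redirect_method('POST', 307) -> 'POST'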

class ProxyHandler(urllib.request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the wrapping of the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
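
# A hypothetical usage sketch (not part of this module): a per-request proxy
# set via the Ytdl-request-proxy header overrides the handler-wide mapping.
#
#   handler = ProxyHandler({'http': 'http://10.0.0.1:8080'})
#   req = urllib.request.Request('http://example.com')
#   req.add_header('Ytdl-request-proxy', 'socks5://127.0.0.1:1080')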

class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'
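
# A hypothetical usage sketch (not part of this module): HEADRequest probes a
# URL's headers without downloading the body.
#
#   resp = urllib.request.urlopen(HEADRequest('http://example.com'))
#   length = resp.headers.get('Content-Length')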

def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
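
# A hypothetical usage sketch (not part of this module): rebuilding a request
# with an extra query parameter while preserving its method and headers.
#
#   req = HEADRequest('http://example.com/api', headers={'X-Token': 'abc'})
#   new_req = update_Request(req, query={'page': '2'})
#   assert new_req.get_method() == 'HEAD'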