1 from __future__
import annotations
11 import urllib
.response
13 from urllib
.request
import (
18 HTTPDefaultErrorHandler
,
23 from ._helper
import (
25 add_accept_encoding_header
,
27 make_socks_proxy_opts
,
30 from .common
import Features
, RequestHandler
, Response
, register_rh
31 from .exceptions
import (
32 CertificateVerifyError
,
40 from ..dependencies
import brotli
41 from ..socks
import ProxyError
as SocksProxyError
42 from ..socks
import sockssocket
43 from ..utils
import update_url_query
44 from ..utils
.networking
import normalize_url
# Content encodings we advertise via Accept-Encoding and can transparently decode.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exceptions that may surface while decoding a response body; these are mapped
# to TransportError by handle_response_read_exceptions() below.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency (`..dependencies` exposes None when it is
# not installed), so only advertise/handle 'br' when the module is present.
# Without this guard, `brotli.error` would raise AttributeError at import time.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
54 def _create_http_connection(http_class
, source_address
, *args
, **kwargs
):
55 hc
= http_class(*args
, **kwargs
)
57 if source_address
is not None:
58 # This is to workaround _create_connection() from socket where it will try all
59 # address data from getaddrinfo() including IPv6. This filters the result from
60 # getaddrinfo() based on the source_address value.
61 # This is based on the cpython socket.create_connection() function.
62 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
63 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
66 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
67 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
68 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
69 if addrs
and not ip_addrs
:
70 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
72 "No remote IP%s addresses available for connect, can't use '%s' as source address"
73 % (ip_version
, source_address
[0]))
75 af
, socktype
, proto
, canonname
, sa
= res
78 sock
= socket
.socket(af
, socktype
, proto
)
79 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
80 sock
.settimeout(timeout
)
81 sock
.bind(source_address
)
83 err
= None # Explicitly break reference cycle
92 raise OSError('getaddrinfo returns an empty list')
93 if hasattr(hc
, '_create_connection'):
94 hc
._create
_connection
= _create_connection
95 hc
.source_address
= (source_address
, 0)
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address  # local IP to bind sockets to, or None
        self._context = context  # ssl.SSLContext for HTTPS, or None

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class in a SOCKS-aware subclass when the request
        # carries the internal Ytdl-socks-proxy marker header (set by ProxyHandler).
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate body, accepting both raw and zlib-wrapped streams."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli body (caller guarantees brotli is available)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip body."""
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of `base_class` whose connect() goes through a SOCKS proxy.

    @param base_class   http.client.HTTPConnection or http.client.HTTPSConnection
    @param socks_proxy  proxy URL understood by make_socks_proxy_opts()
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is negotiated on top of the established SOCKS tunnel.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every redirect status through the shared http_error_302 machinery.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = \
        urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
class ProxyHandler(urllib.request.BaseHandler):
    # NOTE(review): handler_order reconstructed from a gap in the source —
    # a low value makes this handler run before the default handlers.
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for type in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
class PUTRequest(urllib.request.Request):
    """Request subclass that always uses the PUT method."""

    def get_method(self):
        return 'PUT'
class HEADRequest(urllib.request.Request):
    """Request subclass that always uses the HEAD method."""

    def get_method(self):
        return 'HEAD'
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of `req` with url/data/headers/query selectively replaced.

    The method of the original request is preserved by choosing the matching
    Request subclass (HEADRequest/PUTRequest) where needed.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Preserve a timeout set by the opener machinery, if any.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read failures into the project's exception types.
            handle_response_read_exceptions(e)
            raise e
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching project exception; no-op otherwise."""
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response to project exception types.

    Falls through silently for exception types it does not recognise.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # These extensions are consumed here, so remove them from validation.
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        # NOTE(review): some handler-list entries were reconstructed from gaps
        # in the source (DataHandler/UnknownHandler/FTPHandler/RedirectHandler).
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)