from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url
# Encodings advertised in Accept-Encoding and the exceptions that can be
# raised while decoding response bodies.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency; only advertise/handle it when available.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
54 def _create_http_connection(http_class
, source_address
, *args
, **kwargs
):
55 hc
= http_class(*args
, **kwargs
)
57 if source_address
is not None:
58 # This is to workaround _create_connection() from socket where it will try all
59 # address data from getaddrinfo() including IPv6. This filters the result from
60 # getaddrinfo() based on the source_address value.
61 # This is based on the cpython socket.create_connection() function.
62 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
63 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
66 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
67 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
68 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
69 if addrs
and not ip_addrs
:
70 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
72 "No remote IP%s addresses available for connect, can't use '%s' as source address"
73 % (ip_version
, source_address
[0]))
75 af
, socktype
, proto
, canonname
, sa
= res
78 sock
= socket
.socket(af
, socktype
, proto
)
79 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
80 sock
.settimeout(timeout
)
81 sock
.bind(source_address
)
83 err
= None # Explicitly break reference cycle
92 raise OSError('getaddrinfo returns an empty list')
93 if hasattr(hc
, '_create_connection'):
94 hc
._create
_connection
= _create_connection
95 hc
.source_address
= (source_address
, 0)
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class with SOCKS support when the request
        # carries the internal Ytdl-socks-proxy header (set by ProxyHandler).
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then fall back to a zlib-wrapped stream,
        # since servers use either form for Content-Encoding: deflate.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from ``base_class`` whose ``connect`` goes
    through the SOCKS proxy described by the ``socks_proxy`` URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS must be negotiated on top of the proxied socket.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7231
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler that supports http(s) and socks proxies per-scheme.

    For socks proxies it does not open the connection itself; it tags the
    request with an internal header that HTTPHandler consumes.
    """
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for type in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
class PUTRequest(urllib.request.Request):
    """A urllib Request that always uses the PUT method."""
    def get_method(self):
        return 'PUT'
class HEADRequest(urllib.request.Request):
    """A urllib Request that always uses the HEAD method."""
    def get_method(self):
        return 'HEAD'
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of ``req`` with the given url/data/headers/query applied.

    The request class is chosen so that the copy preserves the original
    method (HEAD/PUT need dedicated Request subclasses under urllib).
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # urllib sets .timeout on requests it processes; carry it over if present.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read failures into unified networking errors;
            # re-raise unchanged if the handler does not recognize the exception.
            handle_response_read_exceptions(e)
            raise e
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as CertificateVerifyError or SSLError.

    Anything that is not an ssl.SSLError is ignored (the function returns).
    """
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response body onto the unified
    networking exceptions. Unrecognized exceptions are left un-raised so the
    caller can decide what to do with them.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler backed by the standard-library urllib opener stack."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        # cookiejar and timeout are supported extensions; pop them so the
        # base class does not reject them as unknown.
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)