1 from __future__
import annotations
12 import urllib
.response
14 from urllib
.request
import (
19 HTTPDefaultErrorHandler
,
24 from ._helper
import (
26 add_accept_encoding_header
,
28 make_socks_proxy_opts
,
31 from .common
import Features
, RequestHandler
, Response
, register_rh
32 from .exceptions
import (
33 CertificateVerifyError
,
41 from ..dependencies
import brotli
42 from ..socks
import ProxyError
as SocksProxyError
43 from ..socks
import sockssocket
44 from ..utils
import update_url_query
45 from ..utils
.networking
import normalize_url
# Content encodings advertised via Accept-Encoding. gzip/deflate are handled
# with the stdlib; 'br' is added below only when the optional brotli
# dependency is available.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that indicate a failure while decoding a response body.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
55 def _create_http_connection(http_class
, source_address
, *args
, **kwargs
):
56 hc
= http_class(*args
, **kwargs
)
58 if source_address
is not None:
59 # This is to workaround _create_connection() from socket where it will try all
60 # address data from getaddrinfo() including IPv6. This filters the result from
61 # getaddrinfo() based on the source_address value.
62 # This is based on the cpython socket.create_connection() function.
63 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
64 def _create_connection(address
, timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
, source_address
=None):
67 addrs
= socket
.getaddrinfo(host
, port
, 0, socket
.SOCK_STREAM
)
68 af
= socket
.AF_INET
if '.' in source_address
[0] else socket
.AF_INET6
69 ip_addrs
= [addr
for addr
in addrs
if addr
[0] == af
]
70 if addrs
and not ip_addrs
:
71 ip_version
= 'v4' if af
== socket
.AF_INET
else 'v6'
73 "No remote IP%s addresses available for connect, can't use '%s' as source address"
74 % (ip_version
, source_address
[0]))
76 af
, socktype
, proto
, canonname
, sa
= res
79 sock
= socket
.socket(af
, socktype
, proto
)
80 if timeout
is not socket
._GLOBAL
_DEFAULT
_TIMEOUT
:
81 sock
.settimeout(timeout
)
82 sock
.bind(source_address
)
84 err
= None # Explicitly break reference cycle
93 raise OSError('getaddrinfo returns an empty list')
94 if hasattr(hc
, '_create_connection'):
95 hc
._create
_connection
= _create_connection
96 hc
.source_address
= (source_address
, 0)
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address  # local address to bind outgoing sockets to
        self._context = context  # ssl.SSLContext used for HTTPS connections

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class with SOCKS support when the request
        # carries the internal Ytdl-socks-proxy marker header.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Try raw deflate first (no zlib header), then fall back to zlib-wrapped.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, TLS is negotiated on top of the established SOCKS tunnel.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Only genuine redirect status codes are followed; anything else is
        # surfaced to the caller as an HTTPError.
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
class ProxyHandler(urllib.request.BaseHandler):
    # Run before the other handlers (default handler_order is 500).
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % scheme, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return None
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
class PUTRequest(urllib.request.Request):
    """urllib Request that always issues the HTTP PUT method."""

    def get_method(self):
        return 'PUT'
class HEADRequest(urllib.request.Request):
    """urllib Request that always issues the HTTP HEAD method."""

    def get_method(self):
        return 'HEAD'
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Build a new urllib Request based on *req*, overriding url/data/headers/query.

    The method of the original request is preserved by choosing the matching
    Request subclass (HEADRequest/PUTRequest) when needed.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read failures into the project's exception types.
            handle_response_read_exceptions(e)
            raise e
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching project exception; no-op otherwise."""
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response body onto the project's
    exception hierarchy (IncompleteRead / SSL errors / TransportError)."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # These extensions are supported by this handler; remove them so the
        # base class does not flag them as unsupported.
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)