from __future__ import annotations

import functools
import gzip
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
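
# With brotli available, add_accept_encoding_header() below advertises all three
# encodings, e.g. 'Accept-Encoding: gzip, deflate, br' (illustrative; the exact
# header value is produced by ._helper.add_accept_encoding_header).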


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    # Bind to the requested local address (port 0 lets the OS pick one)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as e:
                    err = e
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            raise OSError('getaddrinfo returns an empty list')

        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
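
# Illustrative only: _create_http_connection(http.client.HTTPConnection,
# '192.0.2.10', 'example.com', 80) yields a connection whose socket binds to
# ('192.0.2.10', 0) before connecting, restricting lookups to IPv4 addresses.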


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
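
    # Decoding illustration (not in the original source): a response with
    # 'Content-Encoding: gzip, br' was compressed with gzip first, then brotli,
    # so http_response() below decodes brotli first and gzip second.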

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Try raw deflate first: some servers send the stream without a zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)
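
    # e.g. a URL like 'http://example.com/sér' is reissued as
    # 'http://example.com/s%C3%A9r' (illustrative input; the escaping itself is
    # done by ..utils.escape_url).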

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
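
# e.g. make_socks_conn_class(http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
# returns an HTTPConnection subclass that tunnels through the given SOCKS proxy
# (illustrative; the proxy URL is parsed by ._helper.make_socks_proxy_opts).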


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
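
    # e.g. a POST redirected via 303 is reissued as a GET with no body and without
    # its Content-Length/Content-Type headers, whereas 307/308 preserve both the
    # method and the payload (the mapping comes from ._helper.get_redirect_method).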


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for type in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
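
# e.g. ProxyHandler({'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'})
# routes matching requests through the SOCKS proxy via the Ytdl-socks-proxy header
# (illustrative; per-URL proxy selection is done by ._helper.select_proxy).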


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
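
# Illustrative only: update_Request(req, query={'page': '2'}) clones the request
# with '?page=2' merged into its URL while preserving method, data and headers.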


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener
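
    # Openers are cached per (proxies, cookiejar) pair, so _get_instance() below
    # only builds a new OpenerDirector on a cache miss (descriptive note; the
    # caching logic lives in ._helper.InstanceStoreMixin).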

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
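
# Illustrative usage (not part of the original module): @register_rh makes this
# handler available to the framework's request director; a standalone sketch
# might look like:
#
#   rh = UrllibRH(verbose=False)
#   response = rh.send(Request('https://example.com'))  # Request from .common
#   body = response.read()
#
# Constructor arguments are defined by .common.RequestHandler and may differ.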