from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
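# These encodings are advertised to servers via the Accept-Encoding header
# (see add_accept_encoding_header() in _send() below); with brotli available
# the advertised set becomes gzip, deflate and br.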

def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        # A port of 0 tells the OS to pick a free ephemeral port for the bind
        hc.source_address = (source_address, 0)

    return hc
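# Swapping in _create_connection above makes http.client use the framework's
# create_connection() helper from ._helper, which accepts the
# _create_socket_func hook that SocksConnection (below) relies on.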

class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context
    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base

        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)

        return conn_class
    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)
    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Raw deflate stream (no zlib header)
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)
    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if it changed after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)
    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in the order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
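
# For example, a response with "Content-Encoding: gzip, br" was gzipped first
# and then brotli-compressed, so http_response() above decodes in reverse:
# brotli first, then gzip. Roughly equivalent manual decoding (sketch,
# assuming `raw` holds the compressed body):
#
#   data = brotli.decompress(raw)                             # undo 'br'
#   data = zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)   # undo 'gzip'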

def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            def sock_socket_connect(ip_addr, timeout, source_address):
                af, socktype, proto, canonname, sa = ip_addr
                sock = sockssocket(af, socktype, proto)
                try:
                    connect_proxy_args = proxy_args.copy()
                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
                    sock.setproxy(**connect_proxy_args)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect((self.host, self.port))
                    return sock
                except OSError:
                    sock.close()
                    raise

            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
                source_address=self.source_address, _create_socket_func=sock_socket_connect)
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
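
# Usage sketch (the proxy URL is a made-up example): wrapping
# http.client.HTTPConnection yields a connection class whose connect()
# tunnels through the SOCKS proxy before any HTTP bytes are sent:
#
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)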

class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # Only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)

class ProxyHandler(urllib.request.BaseHandler):
    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for type in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))
    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
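
# Flow for SOCKS proxies: instead of tunnelling here, proxy_open() only tags
# the request with the 'Ytdl-socks-proxy' header; HTTPHandler._make_conn_class()
# later pops that header and swaps in the connection class built by
# make_socks_conn_class() above.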

class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'
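
# urllib.request.Request infers only GET or POST from whether data is set;
# these subclasses pin the method so update_Request() below can rebuild HEAD
# and PUT requests without losing their verb.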

def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
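
# Minimal usage sketch (URL and query are made-up examples):
#
#   req = urllib.request.Request('http://example.com/video')
#   req = update_Request(req, query={'format': 'json'})
#   assert req.get_full_url() == 'http://example.com/video?format=json'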

class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e

def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e

def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
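
# Together, handle_sslerror() and handle_response_read_exceptions() funnel
# low-level socket, TLS and decompression failures into the framework's own
# exception hierarchy, so callers need only catch the networking exceptions
# rather than urllib/http.client internals.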

@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')
    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener
    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # Proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
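
# Minimal usage sketch via the networking framework (the director wiring is
# an illustrative assumption; yt-dlp normally constructs it internally):
#
#   from yt_dlp.networking.common import Request, RequestDirector
#   director = RequestDirector(logger=logger)
#   director.add_handler(UrllibRH(logger=logger))
#   response = director.send(Request('https://example.com'))
#   body = response.read()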