]>
Commit | Line | Data |
---|---|---|
227bf1a3 | 1 | from __future__ import annotations |
2 | ||
c365dba8 | 3 | import functools |
c365dba8 | 4 | import http.client |
5 | import io | |
6 | import socket | |
7 | import ssl | |
8 | import urllib.error | |
9 | import urllib.parse | |
10 | import urllib.request | |
11 | import urllib.response | |
12 | import zlib | |
227bf1a3 | 13 | from urllib.request import ( |
14 | DataHandler, | |
15 | FileHandler, | |
16 | FTPHandler, | |
17 | HTTPCookieProcessor, | |
18 | HTTPDefaultErrorHandler, | |
19 | HTTPErrorProcessor, | |
20 | UnknownHandler, | |
21 | ) | |
c365dba8 | 22 | |
23 | from ._helper import ( | |
227bf1a3 | 24 | InstanceStoreMixin, |
c365dba8 | 25 | add_accept_encoding_header, |
20fbbd92 | 26 | create_connection, |
c365dba8 | 27 | get_redirect_method, |
28 | make_socks_proxy_opts, | |
227bf1a3 | 29 | select_proxy, |
30 | ) | |
62b5c94c | 31 | from .common import Features, RequestHandler, Response, register_rh |
227bf1a3 | 32 | from .exceptions import ( |
33 | CertificateVerifyError, | |
34 | HTTPError, | |
35 | IncompleteRead, | |
36 | ProxyError, | |
37 | RequestError, | |
38 | SSLError, | |
39 | TransportError, | |
c365dba8 | 40 | ) |
41 | from ..dependencies import brotli | |
227bf1a3 | 42 | from ..socks import ProxyError as SocksProxyError |
c365dba8 | 43 | from ..socks import sockssocket |
4bf91228 | 44 | from ..utils import update_url_query |
45 | from ..utils.networking import normalize_url | |
c365dba8 | 46 | |
# Content-Encoding values this module can decode, advertised in Accept-Encoding
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    # brotli is an optional dependency; advertise/handle it only when importable
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
c365dba8 | 53 | |
54 | ||
227bf1a3 | 55 | def _create_http_connection(http_class, source_address, *args, **kwargs): |
c365dba8 | 56 | hc = http_class(*args, **kwargs) |
c365dba8 | 57 | |
20fbbd92 | 58 | if hasattr(hc, '_create_connection'): |
59 | hc._create_connection = create_connection | |
60 | ||
c365dba8 | 61 | if source_address is not None: |
c365dba8 | 62 | hc.source_address = (source_address, 0) |
63 | ||
64 | return hc | |
65 | ||
66 | ||
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        # context: ssl.SSLContext used for https connections
        # source_address: local IP address to bind outgoing connections to
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Wrap the connection class with SOCKS support when the request carries
        # the internal Ytdl-socks-proxy header (set by ProxyHandler).
        # The header is popped so it is never sent over the wire.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate payload, accepting both raw and zlib-wrapped streams."""
        if not data:
            return data
        try:
            # Raw deflate stream (negative wbits = no zlib header), as some servers send
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli payload (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip payload."""
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        # (wbits = MAX_WBITS | 16 selects gzip format in zlib)
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        """Percent-encode non-ASCII characters in the request URL before sending."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode compressed response bodies and normalize redirect Location headers."""
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            # `decoded_response or resp.read()`: read the raw body only once,
            # then keep feeding the previous decode stage's output forward
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded bytes in a response-like object, preserving metadata
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    # del-then-set replaces the header value in place
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
181 | ||
182 | ||
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connection is tunnelled through a SOCKS proxy.

    base_class: http.client.HTTPConnection or http.client.HTTPSConnection.
    socks_proxy: proxy URL accepted by make_socks_proxy_opts.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        # Route address resolution/connection through the shared helper
        _create_connection = create_connection

        def connect(self):
            # Socket factory handed to create_connection: builds a sockssocket
            # aimed at one resolved proxy address and asks the proxy to reach
            # (self.host, self.port).
            def sock_socket_connect(ip_addr, timeout, source_address):
                af, socktype, proto, canonname, sa = ip_addr
                sock = sockssocket(af, socktype, proto)
                try:
                    connect_proxy_args = proxy_args.copy()
                    # Use the resolved proxy address for this particular attempt
                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
                    sock.setproxy(**connect_proxy_args)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    # Connect to the final destination through the proxy
                    sock.connect((self.host, self.port))
                    return sock
                except socket.error:
                    # Ensure the socket is not leaked on a failed attempt
                    sock.close()
                    raise
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
                source_address=self.source_address, _create_socket_func=sock_socket_connect)
            if isinstance(self, http.client.HTTPSConnection):
                # Wrap with TLS only after the SOCKS tunnel is established
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
216 | ||
217 | ||
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every supported redirect status through the same implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request for a redirect, or raise for unsupported codes."""
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        payload = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        stripped = ['Cookie']

        method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if method != req.get_method():
            payload = None
            stripped += ['Content-Length', 'Content-Type']

        kept_headers = {name: value for name, value in req.headers.items() if name.title() not in stripped}

        return urllib.request.Request(
            newurl, headers=kept_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method, data=payload)
255 | ||
256 | ||
class ProxyHandler(urllib.request.BaseHandler):
    """Per-request proxy selection handler supporting HTTP and SOCKS proxies.

    Unlike urllib.request.ProxyHandler, the proxy is chosen at request time via
    select_proxy. SOCKS proxies are not handled here; they are signalled to the
    HTTP handler through the internal Ytdl-socks-proxy header.
    """
    # Run before the default handlers (lower order = earlier)
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping consulted by select_proxy, or None
        self.proxies = proxies
        # Set default handlers: register <scheme>_open hooks so the
        # OpenerDirector routes these schemes through proxy_open.
        # (Renamed loop variable so the builtin `type` is not shadowed.)
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        """Attach proxy info to *req*; returns None so other handlers continue."""
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        # Delegate plain HTTP(S) proxying to urllib's stock implementation
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
c365dba8 | 276 | |
277 | ||
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
281 | ||
282 | ||
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
286 | ||
287 | ||
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req* with url/data/headers/query selectively replaced.

    Falls back to the original request's values for anything not given.
    The method (GET/HEAD/PUT/...) of the original request is preserved.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the original HTTP method via the matching Request subclass
    request_cls = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
        req.get_method(), urllib.request.Request)
    updated = request_cls(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
227bf1a3 | 306 | |
307 | ||
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # Prefer .status over the deprecated .getcode():
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status_code = getattr(res, 'status', None) or res.getcode()
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=status_code, reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        """Read up to *amt* bytes, mapping low-level errors to networking exceptions."""
        try:
            return self.fp.read(amt)
        except Exception as err:
            handle_response_read_exceptions(err)
            raise err
328 | ||
329 | ||
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as CertificateVerifyError or SSLError.

    Anything that is not an ssl.SSLError is ignored (returns None).
    """
    if not isinstance(e, ssl.SSLError):
        return
    # Certificate verification failures get their own, more specific type
    wrapper = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise wrapper(cause=e) from e
336 | ||
337 | ||
def handle_response_read_exceptions(e):
    """Map low-level errors raised while reading a response onto unified networking errors.

    Known error types are re-raised as IncompleteRead / SSL errors / TransportError;
    unrecognised exceptions are left for the caller to re-raise.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
        return
    if isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
346 | ||
347 | ||
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler implementation backed by the standard-library urllib.request stack."""

    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file:// support is opt-in; when enabled, advertise the scheme on this
        # instance only (shadows the class-level tuple)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # NOTE(review): popping appears to mark these extensions as supported
        # for the base class's validation — confirm against RequestHandler
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build an OpenerDirector with our custom handler chain (cached via InstanceStoreMixin)."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        """Send *request* through the opener, mapping urllib errors onto unified exceptions."""
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        # Per-request proxies/cookiejar fall back to the handler-level defaults
        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)