]> jfr.im git - yt-dlp.git/blame - yt_dlp/networking/_urllib.py
[core] Fix support for upcoming Python 3.12 (#8130)
[yt-dlp.git] / yt_dlp / networking / _urllib.py
CommitLineData
227bf1a3 1from __future__ import annotations
2
c365dba8 3import functools
c365dba8 4import http.client
5import io
6import socket
7import ssl
8import urllib.error
9import urllib.parse
10import urllib.request
11import urllib.response
12import zlib
227bf1a3 13from urllib.request import (
14 DataHandler,
15 FileHandler,
16 FTPHandler,
17 HTTPCookieProcessor,
18 HTTPDefaultErrorHandler,
19 HTTPErrorProcessor,
20 UnknownHandler,
21)
c365dba8 22
23from ._helper import (
227bf1a3 24 InstanceStoreMixin,
c365dba8 25 add_accept_encoding_header,
26 get_redirect_method,
27 make_socks_proxy_opts,
227bf1a3 28 select_proxy,
29)
62b5c94c 30from .common import Features, RequestHandler, Response, register_rh
227bf1a3 31from .exceptions import (
32 CertificateVerifyError,
33 HTTPError,
34 IncompleteRead,
35 ProxyError,
36 RequestError,
37 SSLError,
38 TransportError,
c365dba8 39)
40from ..dependencies import brotli
227bf1a3 41from ..socks import ProxyError as SocksProxyError
c365dba8 42from ..socks import sockssocket
4bf91228 43from ..utils import update_url_query
44from ..utils.networking import normalize_url
c365dba8 45
# Content-Encoding values this module can decode locally (advertised via Accept-Encoding).
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decoding a compressed response body.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# brotli is an optional dependency; enable it only when the import succeeded.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
c365dba8 52
53
227bf1a3 54def _create_http_connection(http_class, source_address, *args, **kwargs):
c365dba8 55 hc = http_class(*args, **kwargs)
c365dba8 56
57 if source_address is not None:
58 # This is to workaround _create_connection() from socket where it will try all
59 # address data from getaddrinfo() including IPv6. This filters the result from
60 # getaddrinfo() based on the source_address value.
61 # This is based on the cpython socket.create_connection() function.
62 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
63 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
64 host, port = address
65 err = None
66 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
67 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
68 ip_addrs = [addr for addr in addrs if addr[0] == af]
69 if addrs and not ip_addrs:
70 ip_version = 'v4' if af == socket.AF_INET else 'v6'
71 raise OSError(
72 "No remote IP%s addresses available for connect, can't use '%s' as source address"
73 % (ip_version, source_address[0]))
74 for res in ip_addrs:
75 af, socktype, proto, canonname, sa = res
76 sock = None
77 try:
78 sock = socket.socket(af, socktype, proto)
79 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
80 sock.settimeout(timeout)
81 sock.bind(source_address)
82 sock.connect(sa)
83 err = None # Explicitly break reference cycle
84 return sock
85 except OSError as _:
86 err = _
87 if sock is not None:
88 sock.close()
89 if err is not None:
90 raise err
91 else:
92 raise OSError('getaddrinfo returns an empty list')
93 if hasattr(hc, '_create_connection'):
94 hc._create_connection = _create_connection
95 hc.source_address = (source_address, 0)
96
97 return hc
98
99
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address  # local address to bind outgoing sockets to
        self._context = context  # SSL context used for https connections

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when ProxyHandler has
        # tagged this request with the internal Ytdl-socks-proxy header.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body (raw stream or zlib-wrapped)."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream
        except zlib.error:
            return zlib.decompress(data)  # zlib-wrapped deflate stream

    @staticmethod
    def brotli(data):
        """Decompress a brotli-encoded body (requires the optional brotli module)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip-encoded body."""
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
214
215
def make_socks_conn_class(base_class, socks_proxy):
    """Derive from *base_class* a connection class whose sockets are tunneled
    through the SOCKS proxy described by the *socks_proxy* URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_options = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            # Use a SOCKS-capable socket in place of a plain one.
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_options)
            # Only apply a real numeric timeout; presumably this guards against
            # the non-numeric global-default sentinel — do not pass it through.
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap with TLS only after the SOCKS tunnel is up.
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
234
235
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Route every redirect status through the same handler method.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = get_redirect_method(req.get_method(), code)

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        dropped = {'Cookie'}
        if new_method == req.get_method():
            new_data = req.data
        else:
            # Method changed (e.g. POST -> GET): drop the payload and its headers.
            new_data = None
            dropped.update(('Content-Length', 'Content-Type'))

        filtered_headers = {name: value for name, value in req.headers.items() if name.title() not in dropped}

        return urllib.request.Request(
            newurl, headers=filtered_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
273
274
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler that supports http(s) and SOCKS proxies per-scheme."""

    handler_order = 100  # run before the default handlers

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Register proxy_open for each supported scheme. Note: the original
        # loop variable shadowed the builtin `type`; renamed to `scheme`.
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            # Tag the request; yt-dlp's http/https handlers wrap the socket
            # with SOCKS support when they see this header.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        # Delegate plain http/https/ftp proxying to the stdlib implementation.
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
c365dba8 294
295
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always PUT."""

    def get_method(self):
        return 'PUT'
299
300
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP verb is always HEAD."""

    def get_method(self):
        return 'HEAD'
304
305
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req* with url/data/headers/query optionally replaced,
    preserving the original HTTP method and timeout."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    # Preserve the method by picking the matching Request subclass.
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)
    new_req = request_class(
        update_url_query(url or req.get_full_url(), query),
        data=req.data if data is None else data,
        headers=merged_headers,
        origin_req_host=req.origin_req_host,
        unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
227bf1a3 324
325
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status = getattr(res, 'status', None) or res.getcode()
        reason = getattr(res, 'reason', None)
        super().__init__(fp=res, headers=res.headers, url=res.url, status=status, reason=reason)

    def read(self, amt=None):
        # Translate low-level read failures into the networking exception
        # hierarchy before letting the original exception propagate.
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e
346
347
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the matching networking-layer error.

    Certificate-verification failures map to CertificateVerifyError, any other
    SSL error to SSLError; non-SSL exceptions are ignored (function returns).
    """
    # SSLCertVerificationError is a subclass of SSLError, so check it first.
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    if isinstance(e, ssl.SSLError):
        raise SSLError(cause=e) from e
354
355
def handle_response_read_exceptions(e):
    """Translate exceptions raised while reading a response into the
    networking error hierarchy; unrecognized exceptions pass through silently."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    if isinstance(e, ssl.SSLError):
        handle_sslerror(e)
        return
    if isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
364
365
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler backed by the standard library's urllib.request."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// support is opt-in; when enabled, extend the scheme list.
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        # These extensions are handled by this class (see _send/_get_instance);
        # pop them so they are not reported as unsupported.
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        # Build an OpenerDirector wired with this module's custom handlers;
        # instances are reused via InstanceStoreMixin keyed on the arguments.
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        # Translate the framework Request into a urllib Request and map every
        # urllib/socket/ssl failure onto the networking exception hierarchy.
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)