# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # Work around socket.create_connection(), which tries every address
        # returned by getaddrinfo() (including IPv6), by filtering the
        # getaddrinfo() results based on the source_address value.
        # This is based on the CPython socket.create_connection() implementation.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
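            # Infer the address family from the source address: a dotted-quad
            # string is IPv4, anything else is treated as IPv6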
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
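        # Binding to port 0 lets the OS pick an ephemeral local port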
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
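        # ProxyHandler.proxy_open (below) smuggles the SOCKS proxy URL in this
        # internal header; pop it here so it is never sent to the server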
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
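        # Per RFC 9110, 'deflate' means a zlib-wrapped stream, but some servers
        # send raw deflate data; try raw first and fall back to zlib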
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk appended to the end of the file;
        # we ignore it by only ever decoding a single gzip payload
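        # wbits=MAX_WBITS | 16 tells zlib to expect a gzip container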
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites, and some hand out URLs with non-ASCII characters
        # that are not percent-encoded (see telemb.py, ard.py [#3412]).
        # urllib chokes on such URLs (see http://bugs.python.org/issue3991), so we work
        # around the issue by replacing the request's original URL with a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround was moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order in which they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
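        # e.g. 'Content-Encoding: gzip, br' means gzip was applied first and
        # brotli second, so we decode brotli first and then gzip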
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

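            # For HTTPS connections, TLS is negotiated on top of the
            # established SOCKS tunnel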
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

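    # Route every supported redirect status through the same handler;
    # redirect_request below decides how the method and body are rewritten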
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
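        # e.g. a POST is turned into a GET on 301/302/303 (matching browser
        # behavior), while 307/308 preserve the original method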
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


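# Map low-level read errors onto the unified exception classes from .exceptions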
def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors:
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as when the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
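

# Hedged usage sketch (illustration only): in yt-dlp, requests normally reach
# this handler through yt_dlp.networking.common.RequestDirector, and the
# constructor arguments shown here are assumptions, not the exact required set:
#
#   from yt_dlp.networking.common import Request
#
#   rh = UrllibRH(logger=logger, verbose=False)
#   response = rh.send(Request('https://example.com'))
#   data = response.read()
#   rh.close()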