from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
        hc.source_address = (source_address, 0)

    return hc


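# Illustrative sketch (not part of the original module): constructing the
# connection performs no network I/O, so the source-address injection can be
# observed directly. The host and local address below are placeholders.
def _demo_create_http_connection():
    conn = _create_http_connection(
        http.client.HTTPConnection, '127.0.0.1', 'example.com', 80)
    # The socket will be bound to this local address (port 0 = any free port)
    assert conn.source_address == ('127.0.0.1', 0)

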
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added at the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around this issue we replace the request's original URL with
        # a percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if it changed after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order in which they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


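# Illustrative sketch (not part of the original module): a body compressed
# with deflate and then gzip, i.e. "Content-Encoding: deflate, gzip", is
# decoded by undoing the encodings in reverse header order, exactly as the
# loop in http_response above does.
def _demo_content_decoding():
    import gzip
    body = b'example payload'
    wire = gzip.compress(zlib.compress(body))  # deflate first, then gzip
    # Undo the outer (last-applied) gzip, then the inner deflate
    assert HTTPHandler.deflate(HTTPHandler.gz(wire)) == body

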
def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            def sock_socket_connect(ip_addr, timeout, source_address):
                af, socktype, proto, canonname, sa = ip_addr
                sock = sockssocket(af, socktype, proto)
                try:
                    connect_proxy_args = proxy_args.copy()
                    connect_proxy_args.update({'addr': sa[0], 'port': sa[1]})
                    sock.setproxy(**connect_proxy_args)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:  # noqa: E721
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect((self.host, self.port))
                    return sock
                except socket.error:
                    sock.close()
                    raise
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']), timeout=self.timeout,
                source_address=self.source_address, _create_socket_func=sock_socket_connect)
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


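# Illustrative sketch (not part of the original module): deriving a
# SOCKS-tunnelled connection class is side-effect free until connect() is
# called. The proxy URL and host below are placeholders.
def _demo_socks_conn_class():
    conn_class = make_socks_conn_class(
        http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
    assert issubclass(conn_class, http.client.HTTPConnection)
    conn_class('example.com', 80)  # no traffic happens until connect()

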
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


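# Illustrative sketch (not part of the original module): a 303 response to a
# POST is expected to become a GET with the payload, the body headers and any
# Cookie header dropped. The URLs are placeholders.
def _demo_redirect_method_change():
    req = urllib.request.Request(
        'http://example.com/submit', data=b'payload', method='POST',
        headers={'Content-Type': 'application/x-www-form-urlencoded', 'Cookie': 'a=b'})
    new_req = RedirectHandler().redirect_request(
        req, None, 303, 'See Other', {}, 'http://example.com/result')
    assert new_req.get_method() == 'GET'
    assert new_req.data is None
    assert 'Cookie' not in new_req.headers

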
class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


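# Illustrative sketch (not part of the original module): SOCKS proxies are
# not opened here; they are tagged on the request via a private header and
# picked up later by HTTPHandler._make_conn_class. The proxy URL is a placeholder.
def _demo_proxy_tagging():
    handler = ProxyHandler({'http': 'socks5://127.0.0.1:1080'})
    req = urllib.request.Request('http://example.com')
    handler.proxy_open(req)
    assert req.get_header('Ytdl-socks-proxy') == 'socks5://127.0.0.1:1080'

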
class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


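# Illustrative sketch (not part of the original module): update_Request
# preserves the method while swapping in new query parameters. The URL is a
# placeholder.
def _demo_update_Request():
    req = HEADRequest('http://example.com/path')
    new_req = update_Request(req, query={'page': '2'})
    assert new_req.get_method() == 'HEAD'
    assert new_req.get_full_url() == 'http://example.com/path?page=2'

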
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


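# Illustrative sketch (not part of the original module): a bare addinfourl
# can be adapted into the unified Response API without any network I/O.
def _demo_response_adapter():
    import email.message
    raw = urllib.response.addinfourl(
        io.BytesIO(b'hello'), email.message.Message(), 'http://example.com', 200)
    res = UrllibResponseAdapter(raw)
    assert res.status == 200 and res.read() == b'hello'

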
def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


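# Illustrative sketch (not part of the original module): low-level read
# errors are normalized into this package's exception hierarchy. This assumes
# the networking IncompleteRead stores the partial byte count and expected
# total under the names passed above.
def _demo_exception_mapping():
    try:
        handle_response_read_exceptions(
            http.client.IncompleteRead(b'abc', expected=7))
    except IncompleteRead as e:
        assert e.partial == 3 and e.expected == 7

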
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
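

# Illustrative sketch (not part of the original module): the handler is
# normally driven through a RequestDirector, but it can be exercised directly
# with a data: URL, which needs no network access. This assumes Request is
# importable from .common alongside Response.
def _demo_urllib_rh():
    from .common import Request  # assumed import location
    with UrllibRH() as rh:
        res = rh.send(Request('data:text/plain,hello'))
        assert res.read() == b'hello'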