# yt_dlp/networking/_urllib.py
from __future__ import annotations

import functools
import http.client
import io
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    create_connection,
    create_socks_proxy_socket,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..utils import update_url_query
from ..utils.networking import normalize_url

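# Encodings advertised via the Accept-Encoding request header. Brotli ('br') is
# appended below only when the optional brotli dependency is importable, so we
# never advertise an encoding we cannot decode.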
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if hasattr(hc, '_create_connection'):
        hc._create_connection = create_connection

    if source_address is not None:
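        # A port of 0 lets the OS pick an ephemeral source port while still
        # binding to the requested local address.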
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
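        # Some servers send raw DEFLATE streams without the zlib wrapper, so try
        # decoding raw first and fall back to zlib-wrapped data on error.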
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added to the end of the file
        # We ignore it by only ever decoding a single gzip payload
        if not data:
            return data
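        # wbits=zlib.MAX_WBITS | 16 instructs zlib to expect a gzip header and
        # trailer rather than a zlib wrapper.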
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites - some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
        # To work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL only if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
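        # e.g. for 'Content-Encoding: deflate, gzip' the body was deflated and then
        # gzipped, so it is un-gzipped first and then inflated.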
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
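            # For HTTPS, the TLS handshake happens over the established SOCKS
            # tunnel, with the target host sent as the server_hostname (SNI).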
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

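    # All supported 3xx codes share the 302 implementation; per-code differences
    # (method change, payload removal) are handled in redirect_request() below.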
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # Only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
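    # urllib's BaseHandler defaults to handler_order 500; 100 makes this
    # handler's *_open methods run before the protocol handlers, so the proxy
    # decision is made before any connection is opened.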
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket for SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


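# Request subclasses that pin the HTTP verb; urllib.request.Request otherwise
# derives the method from whether payload data is present.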
class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
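        # Pop the extensions this handler supports; anything left over is
        # treated as unsupported by the base class.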
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

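        # file:// support is opt-in (disabled by default for security reasons)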
        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # Proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
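

# A minimal usage sketch (illustrative only, not part of this module; it
# assumes the Request class and the RequestHandler.send()/close() API from
# yt_dlp.networking, used directly rather than through a RequestDirector):
#
#   from yt_dlp.networking import Request
#
#   rh = UrllibRH()
#   try:
#       response = rh.send(Request('https://example.com'))
#       body = response.read()
#   finally:
#       rh.close()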