from __future__ import annotations

import functools
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import update_url_query
from ..utils.networking import normalize_url

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is a workaround for socket.create_connection(), which tries all
        # addresses returned by getaddrinfo(), including IPv6 ones. Instead, we
        # filter the getaddrinfo() results by the address family matching the
        # given source_address.
        # Based on the CPython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as e:
                    err = e
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
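
# Illustrative sketch (hypothetical values): with source_address '192.0.2.1',
# the patched _create_connection() keeps only AF_INET results, so a host that
# resolves solely to IPv6 raises OSError instead of connecting from the wrong
# address family:
#   conn = _create_http_connection(
#       http.client.HTTPConnection, '192.0.2.1', 'example.com', 80)
#   conn.request('GET', '/')  # binds the socket to ('192.0.2.1', 0)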


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated
    and brotli-compressed responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk appended to the end of the file;
        # we ignore it by only ever decoding a single gzip payload
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)
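
    # Illustrative round-trips (uses the standard-library gzip module;
    # not executed by this module):
    #   >>> HTTPHandler.deflate(zlib.compress(b'payload'))  # zlib-wrapped deflate
    #   b'payload'
    #   >>> import gzip
    #   >>> HTTPHandler.gz(gzip.compress(b'payload'))
    #   b'payload'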

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites, and some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
        # so we work around the issue by replacing the request's original URL with a
        # percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)
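
    # Illustrative example (hypothetical URL): normalize_url() percent-encodes
    # the non-ASCII characters while leaving the rest of the URL intact, e.g.
    #   normalize_url('http://example.com/päth?q=ä')
    #   == 'http://example.com/p%C3%A4th?q=%C3%A4'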

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode the redirect URL of the Location header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp
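
    # Illustrative example: for 'Content-Encoding: gzip, br' the body was
    # gzipped first and then brotli-compressed, so it is decoded in reverse
    # order: brotli first, then gzip.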

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
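
# Illustrative usage sketch (hypothetical proxy address): HTTPHandler calls this
# via _make_conn_class() when a 'Ytdl-socks-proxy' header is present, but the
# factory can also be exercised directly:
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)  # connect() tunnels through the proxy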


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some callers set it in the normal headers anyway.
        # Remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # Only remove the payload if the method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
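
    # Illustrative behaviour (following RFC 7231 and common browser practice,
    # via get_redirect_method()): a POST redirected with 301/302/303 is retried
    # as a GET with its body and Content-* headers dropped, while 307/308 keep
    # both the method and the payload.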


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket for SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
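
    # Illustrative proxies mapping (hypothetical addresses), as consumed by
    # select_proxy():
    #   {'http': 'http://127.0.0.1:3128', 'https': 'socks5://127.0.0.1:1080'}
    # HTTP(S) proxies are handed to urllib's ProxyHandler, while SOCKS proxies
    # are flagged via the Ytdl-socks-proxy header for HTTPHandler to pick up.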


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
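
# Illustrative example (hypothetical URL): update_Request() rebuilds the request
# so the subclass-based method override survives the update:
#   req = HEADRequest('http://example.com/page')
#   new_req = update_Request(req, query={'lang': 'en'})
#   new_req.get_method()    # 'HEAD'
#   new_req.get_full_url()  # 'http://example.com/page?lang=en'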


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e
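
    # Status resolution sketch: getattr(res, 'status', None) covers both
    # response types on modern Python; res.getcode() is only consulted as a
    # fallback for addinfourl objects that lack .status.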


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
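
# Illustrative mapping: http.client.IncompleteRead -> IncompleteRead,
# ssl.SSLCertVerificationError -> CertificateVerifyError, any other
# ssl.SSLError -> SSLError, and ConnectionResetError / EOFError / zlib.error
# -> TransportError. Anything else falls through for the caller to re-raise.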


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when the urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
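
# Illustrative end-to-end sketch (hypothetical and simplified; in yt-dlp this
# handler is normally driven through a RequestDirector rather than directly):
#   from yt_dlp.networking.common import Request
#   with UrllibRH() as rh:
#       response = rh.send(Request('https://example.com'))
#       body = response.read()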