from __future__ import annotations

import functools
import gzip
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
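
# These encodings are advertised to servers via the Accept-Encoding request
# header (see add_accept_encoding_header() in ._helper, used in _send() below);
# with brotli available a request would advertise roughly:
#   Accept-Encoding: gzip, deflate, br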


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is a workaround for _create_connection() in socket, which tries
        # every address returned by getaddrinfo(), including IPv6. This filters
        # the getaddrinfo() results based on the source_address value.
        # Based on the CPython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
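        # e.g. with an IPv4 source_address such as '192.0.2.10' (an example
        # address), only AF_INET results are kept and IPv6 candidates are skipped.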
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
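        # Raw deflate streams lack the zlib header; try raw first
        # (-zlib.MAX_WBITS), then fall back to a zlib-wrapped stream.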
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
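            # Retry with up to 1023 trailing bytes stripped, assuming any
            # trailing garbage is short.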
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around this issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute the URL if escaping changed it
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # The Content-Encoding header lists the encodings in the order they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
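        # e.g. "Content-Encoding: gzip, br" means brotli was applied last,
        # so brotli is decoded first, then gzip.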
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

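            # For HTTPS, the TLS handshake must run over the established
            # SOCKS tunnel, so the socket is wrapped only after connect().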
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

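    # Reuse the 302 handler for all supported redirect codes; the differences
    # between them are handled in redirect_request() below.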
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % scheme, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            return None
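        # Note: this class doesn't inherit from urllib.request.ProxyHandler,
        # so its proxy_open() is called unbound with our instance below.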
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
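# Illustrative only: update_Request preserves the request's method class, e.g.
#   new_req = update_Request(req, query={'page': '2'})
# yields a HEADRequest when the original request used HEAD.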


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (see https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent the file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
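

# Illustrative only (not part of the original module): @register_rh makes this
# handler discoverable by yt-dlp's request director, which constructs it and
# dispatches requests to it via RequestHandler.send(); it is not normally
# instantiated directly.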