]> jfr.im git - yt-dlp.git/blob - yt_dlp/networking/_urllib.py
[networking] Remove dot segments during URL normalization (#7662)
[yt-dlp.git] / yt_dlp / networking / _urllib.py
1 from __future__ import annotations
2
3 import functools
4 import gzip
5 import http.client
6 import io
7 import socket
8 import ssl
9 import urllib.error
10 import urllib.parse
11 import urllib.request
12 import urllib.response
13 import zlib
14 from urllib.request import (
15 DataHandler,
16 FileHandler,
17 FTPHandler,
18 HTTPCookieProcessor,
19 HTTPDefaultErrorHandler,
20 HTTPErrorProcessor,
21 UnknownHandler,
22 )
23
24 from ._helper import (
25 InstanceStoreMixin,
26 add_accept_encoding_header,
27 get_redirect_method,
28 make_socks_proxy_opts,
29 select_proxy,
30 )
31 from .common import Features, RequestHandler, Response, register_rh
32 from .exceptions import (
33 CertificateVerifyError,
34 HTTPError,
35 IncompleteRead,
36 ProxyError,
37 RequestError,
38 SSLError,
39 TransportError,
40 )
41 from ..dependencies import brotli
42 from ..socks import ProxyError as SocksProxyError
43 from ..socks import sockssocket
44 from ..utils import update_url_query
45 from ..utils.networking import normalize_url
46
# Content-Encoding values advertised in Accept-Encoding and decodable by HTTPHandler
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# Brotli support is optional; enable it only when the dependency is importable
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
53
54
55 def _create_http_connection(http_class, source_address, *args, **kwargs):
56 hc = http_class(*args, **kwargs)
57
58 if source_address is not None:
59 # This is to workaround _create_connection() from socket where it will try all
60 # address data from getaddrinfo() including IPv6. This filters the result from
61 # getaddrinfo() based on the source_address value.
62 # This is based on the cpython socket.create_connection() function.
63 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
64 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
65 host, port = address
66 err = None
67 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
68 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
69 ip_addrs = [addr for addr in addrs if addr[0] == af]
70 if addrs and not ip_addrs:
71 ip_version = 'v4' if af == socket.AF_INET else 'v6'
72 raise OSError(
73 "No remote IP%s addresses available for connect, can't use '%s' as source address"
74 % (ip_version, source_address[0]))
75 for res in ip_addrs:
76 af, socktype, proto, canonname, sa = res
77 sock = None
78 try:
79 sock = socket.socket(af, socktype, proto)
80 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
81 sock.settimeout(timeout)
82 sock.bind(source_address)
83 sock.connect(sa)
84 err = None # Explicitly break reference cycle
85 return sock
86 except OSError as _:
87 err = _
88 if sock is not None:
89 sock.close()
90 if err is not None:
91 raise err
92 else:
93 raise OSError('getaddrinfo returns an empty list')
94 if hasattr(hc, '_create_connection'):
95 hc._create_connection = _create_connection
96 hc.source_address = (source_address, 0)
97
98 return hc
99
100
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Local IP address that outgoing connections are bound to (see
        # _create_http_connection); None means no binding.
        self._source_address = source_address
        # ssl.SSLContext passed to HTTPS connections
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # If ProxyHandler tagged this request with a SOCKS proxy, wrap the
        # connection class so its socket is established through that proxy.
        # The internal header is popped so it is never sent over the wire.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        """Open a plain HTTP connection for *req*."""
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        """Open an HTTPS connection for *req* using the configured SSL context."""
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body; empty input is returned as-is."""
        if not data:
            return data
        try:
            # Try raw deflate first (no zlib header), as sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to the standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli-encoded body; empty input is returned as-is."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip-encoded body, tolerating trailing junk bytes."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                # No truncation up to 1023 bytes helped; surface the first error
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded body so downstream consumers see a normal response
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS uses the same URL-escaping and decompression logic
    https_request = http_request
    https_response = http_response
224
225
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* whose socket is opened
    through the SOCKS proxy described by the *socks_proxy* URL."""
    assert issubclass(base_class, (http.client.HTTPConnection, http.client.HTTPSConnection))

    setproxy_kwargs = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**setproxy_kwargs)
            # Deliberate exact-type check: bool and timeout sentinels must not match
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the proxied socket with TLS
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
244
245
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # The Cookie header should technically live in unredirected_hdrs, but
        # some callers set it as a normal header anyway; strip it so cookies
        # never leak to the redirect target.
        stripped_headers = {'Cookie'}

        redirected_method = get_redirect_method(req.get_method(), code)
        redirected_data = req.data
        if redirected_method != req.get_method():
            # Method changed (e.g. POST -> GET): drop the payload and its headers
            redirected_data = None
            stripped_headers.update(('Content-Length', 'Content-Type'))

        kept_headers = {k: v for k, v in req.headers.items() if k.title() not in stripped_headers}

        return urllib.request.Request(
            newurl, headers=kept_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=redirected_method, data=redirected_data)
283
284
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler supporting HTTP and SOCKS proxies.

    SOCKS proxies are not opened here; they are flagged on the request via the
    internal ``Ytdl-socks-proxy`` header, and the HTTP(S) handlers wrap the
    socket themselves (see HTTPHandler._make_conn_class).
    """
    # Run before the default urllib handlers
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Register a <scheme>_open method per supported scheme so that
        # OpenerDirector routes those requests through proxy_open().
        # NOTE: was `for type in (...)` — renamed to avoid shadowing builtins
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy applies; let other handlers open the request directly
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the socket wrapping with SOCKS
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
304
305
class PUTRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always PUT."""
    def get_method(self):
        return 'PUT'
309
310
class HEADRequest(urllib.request.Request):
    """A urllib Request whose HTTP method is always HEAD."""
    def get_method(self):
        return 'HEAD'
314
315
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Return a copy of *req*, optionally overriding its url, data, headers or
    query string, while preserving the HTTP method and timeout."""
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve non-standard methods via the dedicated Request subclasses
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)
    new_req = request_cls(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
334
335
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # Prefer .status over the deprecated .getcode():
        # addinfourl gained .status in Python 3.9 (.getcode() deprecated) [1];
        # HTTPResponse always had .status (.getcode() deprecated) [2].
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        status = getattr(res, 'status', None) or res.getcode()
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=status, reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            # Translate low-level read failures into yt-dlp networking errors
            handle_response_read_exceptions(e)
            raise e
356
357
def handle_sslerror(e: ssl.SSLError):
    """Re-raise *e* as the matching yt-dlp SSL exception; no-op for non-SSL errors."""
    if not isinstance(e, ssl.SSLError):
        return
    exc_cls = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise exc_cls(cause=e) from e
364
365
def handle_response_read_exceptions(e):
    """Map exceptions raised while reading a response body onto yt-dlp
    networking errors; exceptions of other types pass through untouched."""
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # Delegates to the dedicated SSL translator (always raises for SSLError)
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
374
375
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """Request handler backed by the standard-library urllib stack."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        # file: URLs are opt-in only; enabling them allows local file reads
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        """Consume the extensions this handler supports (cookiejar, timeout)."""
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build an OpenerDirector wired with yt-dlp's urllib handlers."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        # Merge per-request headers over the handler-level defaults
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        # Openers are cached per (proxies, cookiejar) via InstanceStoreMixin
        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
464 return UrllibResponseAdapter(res)