from __future__ import annotations

import functools
import gzip
import http.client
import io
import socket
import ssl
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zlib
from urllib.request import (
    DataHandler,
    FileHandler,
    FTPHandler,
    HTTPCookieProcessor,
    HTTPDefaultErrorHandler,
    HTTPErrorProcessor,
    UnknownHandler,
)

from ._helper import (
    InstanceStoreMixin,
    add_accept_encoding_header,
    get_redirect_method,
    make_socks_proxy_opts,
    select_proxy,
)
from .common import Features, RequestHandler, Response, register_rh
from .exceptions import (
    CertificateVerifyError,
    HTTPError,
    IncompleteRead,
    ProxyError,
    RequestError,
    SSLError,
    TransportError,
)
from ..dependencies import brotli
from ..socks import ProxyError as SocksProxyError
from ..socks import sockssocket
from ..utils import escape_url, update_url_query

SUPPORTED_ENCODINGS = ['gzip', 'deflate']
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
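
# With brotli available, add_accept_encoding_header() below advertises all three
# encodings, e.g. 'Accept-Encoding: gzip, deflate, br' (illustrative; the exact
# header value is produced by ._helper.add_accept_encoding_header).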


def _create_http_connection(http_class, source_address, *args, **kwargs):
    hc = http_class(*args, **kwargs)

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    # Bind to the requested local address (port 0 lets the OS pick one)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as e:
                    err = e
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            raise OSError('getaddrinfo returns an empty list')

        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
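
# Illustrative only: _create_http_connection(http.client.HTTPConnection,
# '192.0.2.10', 'example.com', 80) yields a connection whose socket binds to
# ('192.0.2.10', 0) before connecting, restricting lookups to IPv4 addresses.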


class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
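
    # Decoding illustration (not in the original source): a response with
    # 'Content-Encoding: gzip, br' was compressed with gzip first, then brotli,
    # so http_response() below decodes brotli first and gzip second.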

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # Try raw deflate first: some servers send the stream without a zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non-percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)
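
    # e.g. a URL like 'http://example.com/sér' is reissued as
    # 'http://example.com/s%C3%A9r' (illustrative input; the escaping itself is
    # done by ..utils.escape_url).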

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(
                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of the Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if type(self.timeout) in (int, float):  # noqa: E721
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
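
# e.g. make_socks_conn_class(http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
# returns an HTTPConnection subclass that tunnels through the given SOCKS proxy
# (illustrative; the proxy URL is parsed by ._helper.make_socks_proxy_opts).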


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on the HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC 7231
    and with what browsers tend to do [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs;
        # however, in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
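
    # e.g. a POST redirected via 303 is reissued as a GET with no body and without
    # its Content-Length/Content-Type headers, whereas 307/308 preserve both the
    # method and the payload (the mapping comes from ._helper.get_redirect_method).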


class ProxyHandler(urllib.request.BaseHandler):
    handler_order = 100

    def __init__(self, proxies=None):
        self.proxies = proxies
        # Set default handlers
        for type in ('http', 'https', 'ftp'):
            setattr(self, '%s_open' % type, lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            return None
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
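
# e.g. ProxyHandler({'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'})
# routes matching requests through the SOCKS proxy via the Ytdl-socks-proxy header
# (illustrative; per-URL proxy selection is done by ._helper.select_proxy).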


class PUTRequest(urllib.request.Request):
    def get_method(self):
        return 'PUT'


class HEADRequest(urllib.request.Request):
    def get_method(self):
        return 'HEAD'


def update_Request(req, url=None, data=None, headers=None, query=None):
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data if data is not None else req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
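
# Illustrative only: update_Request(req, query={'page': '2'}) clones the request
# with '?page=2' merged into its URL while preserving method, data and headers.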


class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            raise e


def handle_sslerror(e: ssl.SSLError):
    if not isinstance(e, ssl.SSLError):
        return
    if isinstance(e, ssl.SSLCertVerificationError):
        raise CertificateVerifyError(cause=e) from e
    raise SSLError(cause=e) from e


def handle_response_read_exceptions(e):
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e


@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener
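
    # Openers are cached per (proxies, cookiejar) pair, so _get_instance() below
    # only builds a new OpenerDirector on a cache miss (descriptive note; the
    # caching logic lives in ._helper.InstanceStoreMixin).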

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=request.proxies or self.proxies,
            cookiejar=request.extensions.get('cookiejar') or self.cookiejar,
        )
        try:
            res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.file = None
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # http.client.HTTPConnection raises ValueError in some validation cases,
            # such as if the request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)
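
# Illustrative usage (not part of the original module): @register_rh makes this
# handler available to the framework's request director; a standalone sketch
# might look like:
#
#   rh = UrllibRH(verbose=False)
#   response = rh.send(Request('https://example.com'))  # Request from .common
#   body = response.read()
#
# Constructor arguments are defined by .common.RequestHandler and may differ.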