# Extracted from the yt-dlp repository — yt_dlp/networking/_urllib.py
# [networking] Add module (#2861)
1 import functools
2 import gzip
3 import http.client
4 import io
5 import socket
6 import ssl
7 import urllib.error
8 import urllib.parse
9 import urllib.request
10 import urllib.response
11 import zlib
12
13 from ._helper import (
14 add_accept_encoding_header,
15 get_redirect_method,
16 make_socks_proxy_opts,
17 )
18 from ..dependencies import brotli
19 from ..socks import sockssocket
20 from ..utils import escape_url, update_url_query
21 from ..utils.networking import clean_headers, std_headers
22
# Content encodings advertised in the Accept-Encoding request header and
# decoded transparently when they appear in Content-Encoding responses.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']

# Brotli support is optional; advertise 'br' only when the dependency is present.
if brotli:
    SUPPORTED_ENCODINGS.append('br')
27
28
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class* and, when the handler's 'source_address' param
    is set, patch the connection so outgoing sockets bind to that address and
    only remote addresses of the matching IP family are attempted.

    NOTE(review): `is_https` is accepted by the signature but unused in the
    visible code — presumably kept for caller compatibility; confirm.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A '.' in the source address distinguishes an IPv4 literal from IPv6.
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                # Remote offers addresses, but none in the family we can bind from.
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and try the next resolved address.
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only patch when the connection class exposes the private hook.
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
74
75
class HTTPHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        # Route through a SOCKS proxy when requested via the internal
        # Ytdl-socks-proxy header (set by ProxyHandler.proxy_open).
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate payload, raw or zlib-wrapped."""
        if not data:
            return data
        try:
            # Raw deflate stream (no zlib header), sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Standard zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli payload (requires the optional dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress a gzip payload, tolerating trailing junk."""
        # Pass empty payloads through unchanged, consistent with
        # deflate() and brotli() above.
        if not data:
            return data
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        clean_headers(req.headers)
        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        # An explicit `is None` check (not truthiness) is used so that an empty
        # intermediate result (b'') is not mistaken for "body not read yet",
        # which would trigger a second read() of the exhausted response.
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(resp.read() if decoded_response is None else decoded_response)
            elif encoding == 'deflate':
                decoded_response = self.deflate(resp.read() if decoded_response is None else decoded_response)
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(resp.read() if decoded_response is None else decoded_response)

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
198
199
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL.

    *base_class* must be http.client.HTTPConnection or HTTPSConnection.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(**proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # TLS-wrap the tunneled socket after the SOCKS handshake.
            # Every Python 3 HTTPSConnection has an SSLContext in `_context`;
            # the old `ssl.wrap_socket` fallback was Python-2-era dead code
            # (the function itself was removed in Python 3.12).
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(
                    self.sock, server_hostname=self.host)

    return SocksConnection
222
223
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # All redirect codes share the same entry point in the base class.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Anything outside the recognized redirect codes is surfaced as an error.
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        payload = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        stripped = {'Cookie'}

        new_method = get_redirect_method(req.get_method(), code)
        if new_method != req.get_method():
            # Method changed (e.g. POST -> GET): drop the payload and its
            # describing headers.
            payload = None
            stripped.update(('Content-Length', 'Content-Type'))

        kept_headers = {
            name: value for name, value in req.headers.items()
            if name.title() not in stripped
        }

        return urllib.request.Request(
            newurl, data=payload, headers=kept_headers, method=new_method,
            origin_req_host=req.origin_req_host, unverifiable=True)
261
262
class ProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that also honours per-request proxies and SOCKS URLs."""

    def __init__(self, proxies=None):
        # Pre-install default http/https entry points that funnel into
        # proxy_open with the '__noproxy__' sentinel; the base __init__ may
        # then override them for schemes present in *proxies*.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    functools.partial(self.proxy_open, proxy='__noproxy__', type=scheme))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (internal Ytdl-request-proxy header) takes
        # precedence over the configured one.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = req_proxy

        if proxy == '__noproxy__':
            return None  # No Proxy

        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None

        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
286
287
class PUTRequest(urllib.request.Request):
    """Request subclass that forces the HTTP method to 'PUT'."""

    def get_method(self):
        return 'PUT'
291
292
class HEADRequest(urllib.request.Request):
    """Request subclass that forces the HTTP method to 'HEAD'."""

    def get_method(self):
        return 'HEAD'
296
297
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its URL, payload, headers or query.

    The clone keeps the original request's method (via HEADRequest/PUTRequest
    where applicable), origin host, unverifiable flag and timeout.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})

    # Preserve forced-method request types; everything else maps to the
    # plain urllib Request.
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), urllib.request.Request)

    new_req = request_class(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req