yt_dlp/utils/networking.py

   1 import collections
   2 import random
   3 import urllib.parse
   4 import urllib.request
   5
   6 from ._utils import remove_start
   7
   8
   9 def random_user_agent():
  10     _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
  11     _CHROME_VERSIONS = (
  12         '90.0.4430.212',
  13         '90.0.4430.24',
  14         '90.0.4430.70',
  15         '90.0.4430.72',
  16         '90.0.4430.85',
  17         '90.0.4430.93',
  18         '91.0.4472.101',
  19         '91.0.4472.106',
  20         '91.0.4472.114',
  21         '91.0.4472.124',
  22         '91.0.4472.164',
  23         '91.0.4472.19',
  24         '91.0.4472.77',
  25         '92.0.4515.107',
  26         '92.0.4515.115',
  27         '92.0.4515.131',
  28         '92.0.4515.159',
  29         '92.0.4515.43',
  30         '93.0.4556.0',
  31         '93.0.4577.15',
  32         '93.0.4577.63',
  33         '93.0.4577.82',
  34         '94.0.4606.41',
  35         '94.0.4606.54',
  36         '94.0.4606.61',
  37         '94.0.4606.71',
  38         '94.0.4606.81',
  39         '94.0.4606.85',
  40         '95.0.4638.17',
  41         '95.0.4638.50',
  42         '95.0.4638.54',
  43         '95.0.4638.69',
  44         '95.0.4638.74',
  45         '96.0.4664.18',
  46         '96.0.4664.45',
  47         '96.0.4664.55',
  48         '96.0.4664.93',
  49         '97.0.4692.20',
  50     )
  51     return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
  52
  53
  54 class HTTPHeaderDict(collections.UserDict, dict):
  55     """
  56     Store and access keys case-insensitively.
  57     The constructor can take multiple dicts, in which keys in the latter are prioritised.
  58     """
  59
  60     def __init__(self, *args, **kwargs):
  61         super().__init__()
  62         for dct in args:
  63             if dct is not None:
  64                 self.update(dct)
  65         self.update(kwargs)
  66
  67     def __setitem__(self, key, value):
  68         super().__setitem__(key.title(), str(value))
  69
  70     def __getitem__(self, key):
  71         return super().__getitem__(key.title())
  72
  73     def __delitem__(self, key):
  74         super().__delitem__(key.title())
  75
  76     def __contains__(self, key):
  77         return super().__contains__(key.title() if isinstance(key, str) else key)
  78
  79
  80 std_headers = HTTPHeaderDict({
  81     'User-Agent': random_user_agent(),
  82     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  83     'Accept-Language': 'en-us,en;q=0.5',
  84     'Sec-Fetch-Mode': 'navigate',
  85 })
  86
  87
  88 def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
  89     req_proxy = headers.pop('Ytdl-Request-Proxy', None)
  90     if req_proxy:
  91         proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
  92         proxies['all'] = req_proxy
  93     for proxy_key, proxy_url in proxies.items():
  94         if proxy_url == '__noproxy__':
  95             proxies[proxy_key] = None
  96             continue
  97         if proxy_key == 'no':  # special case
  98             continue
  99         if proxy_url is not None:
 100             # Ensure proxies without a scheme are http.
 101             try:
 102                 proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
 103             except ValueError:
 104                 # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
 105                 # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
 106                 # If the proxy is going to be used, the Request Handler proxy validation will handle it.
 107                 continue
 108             if proxy_scheme is None:
 109                 proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')
 110
 111             replace_scheme = {
 112                 'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
 113                 'socks': 'socks4'  # compat: non-standard
 114             }
 115             if proxy_scheme in replace_scheme:
 116                 proxies[proxy_key] = urllib.parse.urlunparse(
 117                     urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
 118
 119
 120 def clean_headers(headers: HTTPHeaderDict):
 121     if 'Youtubedl-No-Compression' in headers:  # compat
 122         del headers['Youtubedl-No-Compression']
 123         headers['Accept-Encoding'] = 'identity'
 124
 125
 126 def remove_dot_segments(path):
 127     # Implements RFC3986 5.2.4 remote_dot_segments
 128     # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
 129     # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
 130     output = []
 131     segments = path.split('/')
 132     for s in segments:
 133         if s == '.':
 134             continue
 135         elif s == '..':
 136             if output:
 137                 output.pop()
 138         else:
 139             output.append(s)
 140     if not segments[0] and (not output or output[0]):
 141         output.insert(0, '')
 142     if segments[-1] in ('.', '..'):
 143         output.append('')
 144     return '/'.join(output)
 145
 146
 147 def escape_rfc3986(s):
 148     """Escape non-ASCII characters as suggested by RFC 3986"""
 149     return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
 150
 151
 152 def normalize_url(url):
 153     """Normalize URL as suggested by RFC 3986"""
 154     url_parsed = urllib.parse.urlparse(url)
 155     return url_parsed._replace(
 156         netloc=url_parsed.netloc.encode('idna').decode('ascii'),
 157         path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
 158         params=escape_rfc3986(url_parsed.params),
 159         query=escape_rfc3986(url_parsed.query),
 160         fragment=escape_rfc3986(url_parsed.fragment)
 161     ).geturl()