]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/networking.py
[networking] Remove dot segments during URL normalization (#7662)
[yt-dlp.git] / yt_dlp / utils / networking.py
1 import collections
2 import random
3 import urllib.parse
4 import urllib.request
5
6 from ._utils import remove_start
7
8
def random_user_agent():
    """Return a desktop Chrome User-Agent string with a randomly chosen version.

    The version is drawn uniformly from a fixed list of Chrome releases
    (90 through 97) and substituted into a Windows 10 / x64 template.
    """
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    picked_version = random.choice(chrome_versions)
    return template % picked_version
52
53
class HTTPHeaderDict(collections.UserDict, dict):
    """
    Store and access keys case-insensitively.
    The constructor can take multiple dicts, in which keys in the latter are prioritised.

    Keys are normalised with `str.title()` on every access, so `'user-agent'`,
    `'USER-AGENT'` and `'User-Agent'` all address the same entry.
    Values are always stored as `str`; `bytes` values are decoded as latin-1
    (the byte-transparent encoding used for HTTP header values) instead of
    being stored as their `b'...'` repr.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        # Apply positional mappings left to right so later ones win; `None` entries are skipped
        for dct in args:
            if dct is not None:
                self.update(dct)
        self.update(kwargs)

    def __setitem__(self, key, value):
        if isinstance(value, bytes):
            # str(b'x') would yield "b'x'"; decode to get the actual header text
            value = value.decode('latin-1')
        super().__setitem__(key.title(), str(value))

    def __getitem__(self, key):
        return super().__getitem__(key.title())

    def __delitem__(self, key):
        super().__delitem__(key.title())

    def __contains__(self, key):
        # Non-string keys can never be stored, but membership tests on them must not raise
        return super().__contains__(key.title() if isinstance(key, str) else key)
78
79
# Module-level baseline header set. `random_user_agent()` is evaluated once, when this
# statement runs, so the User-Agent stays fixed for the lifetime of the module.
# NOTE(review): presumably merged into outgoing requests by callers - usage is not visible here.
std_headers = HTTPHeaderDict({
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
})
86
87
def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
    """Normalise the `proxies` mapping in place.

    - A `Ytdl-Request-Proxy` header is removed from `headers` and, when non-empty,
      replaces every other proxy setting with a single `all` entry.
    - The `'__noproxy__'` marker value is turned into `None`.
    - The `no` entry is left untouched, as are `None` values and unparsable URLs.
    - Proxies without a scheme get `http://`; legacy scheme names are rewritten.
    """
    scheme_aliases = {
        'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
        'socks': 'socks4'  # compat: non-standard
    }

    override = headers.pop('Ytdl-Request-Proxy', None)
    if override:
        proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
        proxies['all'] = override

    # Only values of existing keys are reassigned below, so iterating the live view is safe
    for name, url in proxies.items():
        if url == '__noproxy__':
            proxies[name] = None
        elif name == 'no' or url is None:  # 'no' is a special case
            continue
        else:
            try:
                scheme = urllib.request._parse_proxy(url)[0]
            except ValueError:
                # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
                # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
                # If the proxy is going to be used, the Request Handler proxy validation will handle it.
                continue

            if scheme is None:
                # Ensure proxies without a scheme are http.
                proxies[name] = 'http://' + remove_start(url, '//')
            elif scheme in scheme_aliases:
                parsed = urllib.parse.urlparse(url)
                renamed = parsed._replace(scheme=scheme_aliases[scheme])
                proxies[name] = urllib.parse.urlunparse(renamed)
118
119
def clean_headers(headers: HTTPHeaderDict):
    """Translate the legacy `Youtubedl-No-Compression` header, in place.

    When the header is present it is removed and `Accept-Encoding` is set to
    `identity`; otherwise `headers` is left unchanged.
    """
    if 'Youtubedl-No-Compression' not in headers:
        return
    # compat
    del headers['Youtubedl-No-Compression']
    headers['Accept-Encoding'] = 'identity'
124
125
def remove_dot_segments(path):
    # Implements RFC3986 5.2.4 remove_dot_segments
    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
    parts = path.split('/')
    kept = []
    for part in parts:
        if part == '..':
            # Step back one segment; at the root there is nothing to drop
            if kept:
                kept.pop()
        elif part != '.':
            kept.append(part)

    # An absolute path must stay absolute even if '..' consumed the leading empty segment
    is_absolute = not parts[0]
    if is_absolute and not (kept and not kept[0]):
        kept.insert(0, '')

    # A trailing '.' or '..' refers to a directory, so keep the trailing slash
    if parts[-1] in ('.', '..'):
        kept.append('')

    return '/'.join(kept)
145
146
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved characters and '%' are left as-is so existing escapes and URL structure survive
    unescaped = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=unescaped)
150
151
def normalize_url(url):
    """Normalize URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)

    # Host goes through IDNA; every other component is percent-escaped,
    # and the path additionally has its '.' and '..' segments resolved first
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    clean_path = escape_rfc3986(remove_dot_segments(parts.path))
    clean_params = escape_rfc3986(parts.params)
    clean_query = escape_rfc3986(parts.query)
    clean_fragment = escape_rfc3986(parts.fragment)

    normalized = parts._replace(
        netloc=idna_netloc,
        path=clean_path,
        params=clean_params,
        query=clean_query,
        fragment=clean_fragment,
    )
    return normalized.geturl()