import collections
import random
import urllib.parse
import urllib.request

from ._utils import remove_start


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


class HTTPHeaderDict(collections.UserDict, dict):
    """
    Store and access keys case-insensitively.
    The constructor can take multiple dicts, in which keys in the later dicts are prioritised.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        for dct in args:
            if dct is not None:
                self.update(dct)
        self.update(kwargs)

    def __setitem__(self, key, value):
        if isinstance(value, bytes):
            value = value.decode('latin-1')
        super().__setitem__(key.title(), str(value).strip())

    def __getitem__(self, key):
        return super().__getitem__(key.title())

    def __delitem__(self, key):
        super().__delitem__(key.title())

    def __contains__(self, key):
        return super().__contains__(key.title() if isinstance(key, str) else key)
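# A minimal usage sketch for HTTPHeaderDict above (example values are illustrative,
# not part of the module): keys are stored title-cased, so access is case-insensitive
# and later mappings override earlier ones; bytes values are decoded as latin-1 and
# values are stripped.
#
#   >>> h = HTTPHeaderDict({'user-agent': 'curl/8.0'}, {'USER-AGENT': b' Mozilla/5.0 '})
#   >>> h['User-Agent']
#   'Mozilla/5.0'
#   >>> 'user-agent' in h
#   True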
std_headers = HTTPHeaderDict({
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
})


def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
    req_proxy = headers.pop('Ytdl-Request-Proxy', None)
    if req_proxy:
        proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
        proxies['all'] = req_proxy
    for proxy_key, proxy_url in proxies.items():
        if proxy_url == '__noproxy__':
            proxies[proxy_key] = None
            continue
        if proxy_key == 'no':  # special case
            continue
        if proxy_url is not None:
            # Ensure proxies without a scheme are http.
            try:
                proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
            except ValueError:
                # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
                # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
                # If the proxy is going to be used, the Request Handler proxy validation will handle it.
                continue
            if proxy_scheme is None:
                proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')

            replace_scheme = {
                'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
                'socks': 'socks4',  # compat: non-standard
            }
            if proxy_scheme in replace_scheme:
                proxies[proxy_key] = urllib.parse.urlunparse(
                    urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
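# A minimal sketch of how clean_proxies above normalises a proxy mapping in place
# (the proxy values below are assumptions for the example): scheme-less proxies
# become http, '__noproxy__' becomes None, and socks5 is mapped to socks5h.
#
#   >>> proxies = {'http': 'localhost:3128', 'all': '__noproxy__', 'https': 'socks5://127.0.0.1:1080'}
#   >>> clean_proxies(proxies, HTTPHeaderDict())
#   >>> proxies
#   {'http': 'http://localhost:3128', 'all': None, 'https': 'socks5h://127.0.0.1:1080'}
#
# A 'Ytdl-Request-Proxy' header, if present, replaces the whole mapping with a single
# 'all' entry pointing at that proxy before normalisation.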
def clean_headers(headers: HTTPHeaderDict):
    if 'Youtubedl-No-Compression' in headers:  # compat
        del headers['Youtubedl-No-Compression']
        headers['Accept-Encoding'] = 'identity'
    headers.pop('Ytdl-socks-proxy', None)
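# Illustrative sketch for clean_headers above (example values are assumptions):
# the legacy no-compression flag is translated into an explicit Accept-Encoding,
# and the legacy socks-proxy header is dropped.
#
#   >>> headers = HTTPHeaderDict({'Youtubedl-No-Compression': '1', 'Ytdl-Socks-Proxy': 'socks5://127.0.0.1:1080'})
#   >>> clean_headers(headers)
#   >>> headers['Accept-Encoding'], 'Ytdl-Socks-Proxy' in headers
#   ('identity', False)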
def remove_dot_segments(path):
    # Implements RFC 3986 5.2.4 remove_dot_segments
    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
    output = []
    segments = path.split('/')
    for s in segments:
        if s == '.':
            continue
        elif s == '..':
            if output:
                output.pop()
        else:
            output.append(s)
    if not segments[0] and (not output or output[0]):
        output.insert(0, '')
    if segments[-1] in ('.', '..'):
        output.append('')
    return '/'.join(output)
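# Worked examples for remove_dot_segments above, matching RFC 3986 section 5.2.4:
#
#   >>> remove_dot_segments('/a/b/c/./../../g')
#   '/a/g'
#   >>> remove_dot_segments('mid/content=5/../6')
#   'mid/6'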
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
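# Illustrative example for escape_rfc3986 above (input value is an assumption):
# everything outside the listed safe characters is percent-encoded, with non-ASCII
# encoded as UTF-8 first.
#
#   >>> escape_rfc3986('/watch?v=é&list=a b')
#   '/watch?v=%C3%A9&list=a%20b'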
def normalize_url(url):
    """Normalize URL as suggested by RFC 3986"""
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment),
    ).geturl()
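# A minimal sketch of normalize_url above (hostname and path are illustrative):
# the netloc is IDNA-encoded, the path has dot segments removed, and
# path/params/query/fragment are RFC 3986-escaped.
#
#   >>> normalize_url('http://müller.example/a/b/../c?q=é')
#   'http://xn--mller-kva.example/a/c?q=%C3%A9'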