]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils/networking.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / utils / networking.py
CommitLineData
227bf1a3 1import collections
c365dba8 2import random
227bf1a3 3import urllib.parse
4import urllib.request
5
6from ._utils import remove_start
c365dba8 7
8
def random_user_agent():
    """Return a Chrome-on-Windows desktop User-Agent with a randomly picked Chrome version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    return template % random.choice(chrome_versions)
52
53
class HTTPHeaderDict(collections.UserDict, dict):
    """
    Store and access keys case-insensitively.
    The constructor can take multiple dicts, in which keys in the latter are prioritised.

    Keys are normalized to Title-Case on every access; values are stored as
    stripped strings, with bytes values decoded as latin-1 first.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        # Later mappings (and finally the keyword args) win on duplicate keys
        for mapping in (*args, kwargs):
            if mapping is not None:
                self.update(mapping)

    def __setitem__(self, key, value):
        # bytes header values are treated as latin-1 text
        text = value.decode('latin-1') if isinstance(value, bytes) else value
        super().__setitem__(key.title(), str(text).strip())

    def __getitem__(self, key):
        return super().__getitem__(key.title())

    def __delitem__(self, key):
        super().__delitem__(key.title())

    def __contains__(self, key):
        if isinstance(key, str):
            key = key.title()
        return super().__contains__(key)
80
81
# Default headers attached to outgoing requests; the User-Agent is
# randomized once, when this module is first imported.
std_headers = HTTPHeaderDict({
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
})
88
89
def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
    """
    Normalize a proxy mapping in place.

    - Honours the compat `Ytdl-Request-Proxy` header (popped from `headers`),
      which overrides every other proxy setting, including NO_PROXY.
    - Maps the `__noproxy__` sentinel value to None (proxy explicitly disabled).
    - Prefixes scheme-less proxy URLs with `http://`.
    - Rewrites compat schemes (socks5 -> socks5h, socks -> socks4).
    """
    req_proxy = headers.pop('Ytdl-Request-Proxy', None)
    if req_proxy:
        proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
        proxies['all'] = req_proxy

    # Compat scheme rewrites; hoisted out of the loop as it is loop-invariant
    replace_scheme = {
        'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
        'socks': 'socks4',  # compat: non-standard
    }
    for proxy_key, proxy_url in proxies.items():
        if proxy_url == '__noproxy__':
            proxies[proxy_key] = None
            continue
        if proxy_key == 'no':  # special case
            continue
        if proxy_url is not None:
            # Ensure proxies without a scheme are http.
            try:
                proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
            except ValueError:
                # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
                # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
                # If the proxy is going to be used, the Request Handler proxy validation will handle it.
                continue
            if proxy_scheme is None:
                proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')
            elif proxy_scheme in replace_scheme:
                proxies[proxy_key] = urllib.parse.urlunparse(
                    urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
c365dba8 120
121
def clean_headers(headers: HTTPHeaderDict):
    """Strip yt-dlp-internal/compat headers in place before a request is sent."""
    no_compression = 'Youtubedl-No-Compression'
    if no_compression in headers:  # compat
        del headers[no_compression]
        headers['Accept-Encoding'] = 'identity'
    headers.pop('Ytdl-socks-proxy', None)
4bf91228 127
128
def remove_dot_segments(path):
    """
    Resolve '.' and '..' segments of a path per RFC 3986 5.2.4.

    Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
    https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
    """
    parts = path.split('/')
    stack = []
    for part in parts:
        if part == '..':
            if stack:
                stack.pop()
        elif part != '.':
            stack.append(part)
    # Keep the leading slash of an absolute path
    if not parts[0] and (not stack or stack[0]):
        stack.insert(0, '')
    # A trailing '.' or '..' produces a trailing slash
    if parts[-1] in ('.', '..'):
        stack.append('')
    return '/'.join(stack)
148
149
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # RFC 3986 reserved/sub-delim characters (and '%') are left untouched
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe=safe_chars)
153
154
def normalize_url(url):
    """Normalize URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    return urllib.parse.urlunparse((
        parts.scheme,
        # IDNA-encode the host so non-ASCII domains become ASCII-compatible
        parts.netloc.encode('idna').decode('ascii'),
        escape_rfc3986(remove_dot_segments(parts.path)),
        escape_rfc3986(parts.params),
        escape_rfc3986(parts.query),
        escape_rfc3986(parts.fragment),
    ))