]>
Commit | Line | Data |
---|---|---|
1 | from __future__ import annotations | |
2 | ||
3 | import functools | |
4 | import http.client | |
5 | import io | |
6 | import ssl | |
7 | import urllib.error | |
8 | import urllib.parse | |
9 | import urllib.request | |
10 | import urllib.response | |
11 | import zlib | |
12 | from urllib.request import ( | |
13 | DataHandler, | |
14 | FileHandler, | |
15 | FTPHandler, | |
16 | HTTPCookieProcessor, | |
17 | HTTPDefaultErrorHandler, | |
18 | HTTPErrorProcessor, | |
19 | UnknownHandler, | |
20 | ) | |
21 | ||
22 | from ._helper import ( | |
23 | InstanceStoreMixin, | |
24 | add_accept_encoding_header, | |
25 | create_connection, | |
26 | create_socks_proxy_socket, | |
27 | get_redirect_method, | |
28 | make_socks_proxy_opts, | |
29 | select_proxy, | |
30 | ) | |
31 | from .common import Features, RequestHandler, Response, register_rh | |
32 | from .exceptions import ( | |
33 | CertificateVerifyError, | |
34 | HTTPError, | |
35 | IncompleteRead, | |
36 | ProxyError, | |
37 | RequestError, | |
38 | SSLError, | |
39 | TransportError, | |
40 | ) | |
41 | from ..dependencies import brotli | |
42 | from ..socks import ProxyError as SocksProxyError | |
43 | from ..utils import update_url_query | |
44 | from ..utils.networking import normalize_url | |
45 | ||
# Content-Encoding tokens this handler advertises and can transparently decode
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that may be raised while decompressing a response body
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

# Brotli support is optional — enabled only when the dependency is importable
if brotli:
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
52 | ||
53 | ||
def _create_http_connection(http_class, source_address, *args, **kwargs):
    """Instantiate *http_class*, wiring in our connection helper and an
    optional local source address.

    Remaining positional/keyword arguments are forwarded to the class
    constructor unchanged.
    """
    connection = http_class(*args, **kwargs)

    # Route socket creation through our create_connection helper when the
    # connection class supports overriding it
    if hasattr(connection, '_create_connection'):
        connection._create_connection = create_connection

    # Port 0 lets the OS pick an ephemeral port on the requested interface
    if source_address is not None:
        connection.source_address = (source_address, 0)

    return connection
64 | ||
65 | ||
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        # context: ssl.SSLContext used for HTTPS connections
        # source_address: local IP address to bind outgoing sockets to
        super().__init__(*args, **kwargs)
        self._source_address = source_address
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        """Return *base*, or a SOCKS-wrapping subclass of it when the request
        carries a Ytdl-socks-proxy header (attached by ProxyHandler)."""
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body, tolerating both raw deflate
        streams and zlib-wrapped ones."""
        if not data:
            return data
        try:
            # Negative wbits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a Brotli-encoded body (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        # (wbits = MAX_WBITS | 16 selects gzip-format decoding)
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded body so downstream consumers see a normal
            # file-like response object with the original metadata
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
180 | ||
181 | ||
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels the
    connection through the given SOCKS proxy URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    proxy_args = make_socks_proxy_opts(socks_proxy)

    class SocksConnection(base_class):
        _create_connection = create_connection

        def connect(self):
            # Connect the socket to the SOCKS proxy; the handshake that
            # tunnels on to (self.host, self.port) is done by
            # create_socks_proxy_socket.
            self.sock = create_connection(
                (proxy_args['addr'], proxy_args['port']),
                timeout=self.timeout,
                source_address=self.source_address,
                _create_socket_func=functools.partial(
                    create_socks_proxy_socket, (self.host, self.port), proxy_args))
            # For HTTPS, negotiate TLS with the real target through the tunnel
            if isinstance(self, http.client.HTTPSConnection):
                self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)

    return SocksConnection
202 | ||
203 | ||
class RedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    # Funnel every redirect status through the stock 302 handler; the
    # method/body/header adjustments all happen in redirect_request below.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Non-redirect codes are surfaced to the caller as plain HTTP errors
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_data = req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        new_method = get_redirect_method(req.get_method(), code)
        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            remove_headers.extend(['Content-Length', 'Content-Type'])

        # k.title() normalizes header-name casing for the membership test
        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
241 | ||
242 | ||
class ProxyHandler(urllib.request.BaseHandler):
    """Proxy handler supporting both HTTP(S) and SOCKS proxies.

    For SOCKS proxy URLs, a ``Ytdl-socks-proxy`` header is attached to the
    request and the socket-level wrapping is left to the HTTP handler's
    connection classes; other proxies are delegated to the standard urllib
    ProxyHandler implementation.
    """

    # Run early so the proxy is selected before other handlers see the request
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping of URL scheme -> proxy URL (or None for no proxies)
        self.proxies = proxies
        # Register <scheme>_open callbacks so OpenerDirector routes these
        # schemes through proxy_open.
        # NOTE: `scheme` (not `type`) avoids shadowing the builtin.
        for scheme in ('http', 'https', 'ftp'):
            setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))

    def proxy_open(self, req):
        proxy = select_proxy(req.get_full_url(), self.proxies)
        if proxy is None:
            # No proxy configured for this URL; let other handlers proceed
            return
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # The http/https handlers wrap the socket with SOCKS themselves
            return None
        # Plain HTTP(S)/FTP proxy: reuse urllib's stock proxy handling
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, None)
262 | ||
263 | ||
class PUTRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
267 | ||
268 | ||
class HEADRequest(urllib.request.Request):
    """urllib Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
272 | ||
273 | ||
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req* as a fresh Request, optionally overriding its URL, body,
    headers, or query string.

    The clone keeps the original method (via HEADRequest/PUTRequest when
    needed), origin host, unverifiable flag, and timeout attribute.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = req.data if data is None else data
    full_url = update_url_query(url or req.get_full_url(), query)
    # Pick a Request subclass that preserves a non-default HTTP method
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request
    clone = request_class(
        full_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not a standard Request attribute, so copy it only if present
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
292 | ||
293 | ||
class UrllibResponseAdapter(Response):
    """
    HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
    """

    def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
        # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
        # HTTPResponse: .getcode() was deprecated, .status always existed [2]
        # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
        # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
        super().__init__(
            fp=res, headers=res.headers, url=res.url,
            status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))

    def read(self, amt=None):
        """Read up to *amt* bytes from the underlying response, translating
        low-level read errors into the framework's exception hierarchy."""
        try:
            return self.fp.read(amt)
        except Exception as e:
            handle_response_read_exceptions(e)
            # Bare `raise` re-raises with the original traceback intact
            # (unlike `raise e`, which appends this frame to it)
            raise
314 | ||
315 | ||
def handle_sslerror(e: ssl.SSLError):
    """Re-raise an ssl.SSLError as the framework's SSL exception types.

    Certificate-verification failures map to CertificateVerifyError; every
    other SSL error maps to SSLError. Non-SSL exceptions are ignored.
    """
    if not isinstance(e, ssl.SSLError):
        return
    exc_type = CertificateVerifyError if isinstance(e, ssl.SSLCertVerificationError) else SSLError
    raise exc_type(cause=e) from e
322 | ||
323 | ||
def handle_response_read_exceptions(e):
    """Translate low-level read errors into the framework's exception
    hierarchy; exceptions matching no branch are left untouched.

    NOTE: branch order matters — ssl.SSLError is an OSError subclass, so it
    must be tested before the generic OSError branch below.
    """
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # handle_sslerror raises for every ssl.SSLError instance
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
332 | ||
333 | ||
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """Request handler backed by the standard-library urllib stack."""

    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// is opt-in: it is only advertised when explicitly enabled
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        # Popping an extension marks it as supported/consumed by this handler
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)

    def _create_instance(self, proxies, cookiejar):
        """Build and cache a urllib OpenerDirector wired with our handlers."""
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        """Send *request* through a cached opener, translating urllib errors
        into the framework's exception hierarchy."""
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request)
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string

            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e

            # May raise a more specific error (IncompleteRead, SSLError, ...)
            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected

        return UrllibResponseAdapter(res)