"""
The main purpose of this module is to expose LinkCollector.collect_sources().
"""

import collections
import email.message
import functools
import itertools
import json
import logging
import os
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from optparse import Values
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    Iterable,
    List,
    MutableMapping,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)

from pip._vendor import requests
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.models.link import Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs

from .sources import CandidatesFromPage, LinkSource, build_source

if TYPE_CHECKING:
    from typing import Protocol
else:
    Protocol = object

logger = logging.getLogger(__name__)

ResponseHeaders = MutableMapping[str, str]


def _match_vcs_scheme(url: str) -> Optional[str]:
    """Look for VCS schemes in the URL.

    Returns the matched VCS scheme, or None if there's no match.
    """
    for scheme in vcs.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
            return scheme
    return None
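

# Illustrative sketch (not part of pip): `vcs.schemes` holds bare scheme names
# such as "git", which match only when the URL continues with "+" or ":".
def _demo_match_vcs_scheme() -> None:
    assert _match_vcs_scheme("git+https://example.com/repo.git") == "git"
    assert _match_vcs_scheme("https://example.com/simple/") is None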


class _NotAPIContent(Exception):
    def __init__(self, content_type: str, request_desc: str) -> None:
        super().__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc


def _ensure_api_header(response: Response) -> None:
    """
    Check the Content-Type header to ensure the response contains a Simple
    API Response.

    Raises `_NotAPIContent` if the content type is not a valid content-type.
    """
    content_type = response.headers.get("Content-Type", "Unknown")

    content_type_l = content_type.lower()
    if content_type_l.startswith(
        (
            "text/html",
            "application/vnd.pypi.simple.v1+html",
            "application/vnd.pypi.simple.v1+json",
        )
    ):
        return

    raise _NotAPIContent(content_type, response.request.method)
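

# Illustrative sketch (not part of pip): the header check inspects only
# Content-Type, so a hand-built vendored-requests Response is enough to see
# both outcomes. The stub `Request` stands in for the real prepared request.
def _demo_ensure_api_header() -> None:
    ok = Response()
    ok.headers["Content-Type"] = "application/vnd.pypi.simple.v1+json"
    _ensure_api_header(ok)  # accepted silently

    bad = Response()
    bad.headers["Content-Type"] = "application/octet-stream"
    bad.request = requests.Request(method="HEAD", url="https://example.com/x")
    try:
        _ensure_api_header(bad)
    except _NotAPIContent as exc:
        assert exc.content_type == "application/octet-stream"
        assert exc.request_desc == "HEAD"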


class _NotHTTP(Exception):
    pass


def _ensure_api_response(url: str, session: PipSession) -> None:
    """
    Send a HEAD request to the URL, and ensure the response contains a simple
    API Response.

    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
    `_NotAPIContent` if the content type is not a valid content type.
    """
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    if scheme not in {"http", "https"}:
        raise _NotHTTP()

    resp = session.head(url, allow_redirects=True)
    raise_for_status(resp)

    _ensure_api_header(resp)


def _get_simple_response(url: str, session: PipSession) -> Response:
    """Access a Simple API response with GET, and return the response.

    This consists of three parts:

    1. If the URL looks suspiciously like an archive, send a HEAD first to
       check the Content-Type is HTML or Simple API, to avoid downloading a
       large file. Raise `_NotHTTP` if the content type cannot be determined, or
       `_NotAPIContent` if it is not HTML or a Simple API.
    2. Actually perform the request. Raise HTTP exceptions on network failures.
    3. Check the Content-Type header to make sure we got a Simple API response,
       and raise `_NotAPIContent` otherwise.
    """
    if is_archive_file(Link(url).filename):
        _ensure_api_response(url, session=session)

    logger.debug("Getting page %s", redact_auth_from_url(url))

    resp = session.get(
        url,
        headers={
            "Accept": ", ".join(
                [
                    "application/vnd.pypi.simple.v1+json",
                    "application/vnd.pypi.simple.v1+html; q=0.1",
                    "text/html; q=0.01",
                ]
            ),
            # We don't want to blindly return cached data for
            # /simple/, because authors generally expect that
            # twine upload && pip install will function, but if
            # they've done a pip install in the last ~10 minutes
            # it won't. Thus by setting this to zero we will not
            # blindly use any cached data, however the benefit of
            # using max-age=0 instead of no-cache is that we will
            # still support conditional requests, so we will still
            # minimize traffic sent in cases where the page hasn't
            # changed at all; we will just always incur the round
            # trip for the conditional GET now instead of only
            # once per 10 minutes.
            # For more information, please see pypa/pip#5670.
            "Cache-Control": "max-age=0",
        },
    )
    raise_for_status(resp)

    # The check for archives above only works if the url ends with
    # something that looks like an archive. However that is not a
    # requirement of an url. Unless we issue a HEAD request on every
    # url we cannot know ahead of time for sure if something is a
    # Simple API response or not. However we can check after we've
    # downloaded it.
    _ensure_api_header(resp)

    logger.debug(
        "Fetched page %s as %s",
        redact_auth_from_url(url),
        resp.headers.get("Content-Type", "Unknown"),
    )

    return resp


def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
    """Determine if we have any encoding information in our headers."""
    if headers and "Content-Type" in headers:
        m = email.message.Message()
        m["content-type"] = headers["Content-Type"]
        charset = m.get_param("charset")
        if charset:
            return str(charset)
    return None
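

# Illustrative sketch (not part of pip): charset extraction rides on
# email.message.Message's MIME parameter parsing, exactly as used above.
def _demo_get_encoding_from_headers() -> None:
    headers = {"Content-Type": "text/html; charset=ISO-8859-1"}
    assert _get_encoding_from_headers(headers) == "ISO-8859-1"
    assert _get_encoding_from_headers({"Content-Type": "text/html"}) is None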


class CacheablePageContent:
    def __init__(self, page: "IndexContent") -> None:
        assert page.cache_link_parsing
        self.page = page

    def __eq__(self, other: object) -> bool:
        return isinstance(other, type(self)) and self.page.url == other.page.url

    def __hash__(self) -> int:
        return hash(self.page.url)


class ParseLinks(Protocol):
    def __call__(self, page: "IndexContent") -> Iterable[Link]:
        ...


def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
    """
    Given a function that parses an Iterable[Link] from an IndexContent, cache the
    function's result (keyed by CacheablePageContent), unless the IndexContent
    `page` has `page.cache_link_parsing == False`.
    """

    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page: "IndexContent") -> List[Link]:
        if page.cache_link_parsing:
            return wrapper(CacheablePageContent(page))
        return list(fn(page))

    return wrapper_wrapper
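

# Illustrative sketch (not part of pip): the decorator memoizes per page URL
# whenever cache_link_parsing is set. `_FakePage` is a hypothetical stand-in
# exposing only the two attributes the cache layer touches.
def _demo_with_cached_index_content() -> None:
    calls: List[str] = []

    @with_cached_index_content
    def _parse(page: "IndexContent") -> Iterable[Link]:
        calls.append(page.url)
        return []

    class _FakePage:
        url = "https://example.com/simple/demo/"
        cache_link_parsing = True

    _parse(_FakePage())  # type: ignore[arg-type]
    _parse(_FakePage())  # type: ignore[arg-type]
    assert calls == [_FakePage.url]  # second call was served from the lru_cache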


@with_cached_index_content
def parse_links(page: "IndexContent") -> Iterable[Link]:
    """
    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
    """

    content_type_l = page.content_type.lower()
    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
        data = json.loads(page.content)
        for file in data.get("files", []):
            link = Link.from_json(file, page.url)
            if link is None:
                continue
            yield link
        return

    parser = HTMLLinkParser(page.url)
    encoding = page.encoding or "utf-8"
    parser.feed(page.content.decode(encoding))

    url = page.url
    base_url = parser.base_url or url
    for anchor in parser.anchors:
        link = Link.from_element(anchor, page_url=url, base_url=base_url)
        if link is None:
            continue
        yield link
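

# Illustrative sketch (not part of pip): parsing a minimal PEP 691 JSON page.
# The single file entry uses the standard Simple API keys ("url", "hashes").
def _demo_parse_links_json() -> None:
    page = IndexContent(
        b'{"files": [{"url": "demo-1.0.tar.gz", "hashes": {}}]}',
        "application/vnd.pypi.simple.v1+json",
        encoding=None,
        url="https://example.com/simple/demo/",
        cache_link_parsing=False,
    )
    links = list(parse_links(page))
    assert [link.filename for link in links] == ["demo-1.0.tar.gz"]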
258 """Represents one response (or page), along with its URL"""
264 encoding
: Optional
[str],
266 cache_link_parsing
: bool = True,
269 :param encoding: the encoding to decode the given content.
270 :param url: the URL from which the HTML was downloaded.
271 :param cache_link_parsing: whether links parsed from this page's url
272 should be cached. PyPI index urls should
273 have this set to False, for example.
275 self
.content
= content
276 self
.content_type
= content_type
277 self
.encoding
= encoding
279 self
.cache_link_parsing
= cache_link_parsing
281 def __str__(self
) -> str:
282 return redact_auth_from_url(self
.url
)


class HTMLLinkParser(HTMLParser):
    """
    HTMLParser that keeps the first base HREF and a list of all anchor
    elements' attributes.
    """

    def __init__(self, url: str) -> None:
        super().__init__(convert_charrefs=True)
        self.url: str = url
        self.base_url: Optional[str] = None
        self.anchors: List[Dict[str, Optional[str]]] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        if tag == "base" and self.base_url is None:
            href = self.get_href(attrs)
            if href is not None:
                self.base_url = href
        elif tag == "a":
            self.anchors.append(dict(attrs))

    def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
        for name, value in attrs:
            if name == "href":
                return value
        return None
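

# Illustrative sketch (not part of pip): the parser keeps the first <base href>
# and every anchor's attribute dict from a Simple-API-style HTML page.
def _demo_html_link_parser() -> None:
    parser = HTMLLinkParser(url="https://example.com/simple/demo/")
    parser.feed(
        '<base href="https://files.example.com/">'
        '<a href="demo-1.0.tar.gz">demo-1.0.tar.gz</a>'
    )
    assert parser.base_url == "https://files.example.com/"
    assert parser.anchors == [{"href": "demo-1.0.tar.gz"}]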


def _handle_get_simple_fail(
    link: Link,
    reason: Union[str, Exception],
    meth: Optional[Callable[..., None]] = None,
) -> None:
    if meth is None:
        meth = logger.debug
    meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_index_content(
    response: Response, cache_link_parsing: bool = True
) -> IndexContent:
    encoding = _get_encoding_from_headers(response.headers)
    return IndexContent(
        response.content,
        response.headers["Content-Type"],
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
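

# Illustrative sketch (not part of pip): _make_index_content copies the body,
# declared Content-Type, charset, and final URL off a (stubbed) response.
# Setting the private `_content` attribute is only a shortcut to stub `.content`.
def _demo_make_index_content() -> None:
    resp = Response()
    resp._content = b"<html></html>"
    resp.headers["Content-Type"] = "text/html; charset=utf-8"
    resp.url = "https://example.com/simple/demo/"
    content = _make_index_content(resp, cache_link_parsing=False)
    assert content.encoding == "utf-8"
    assert content.url == resp.url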


def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
    url = link.url.split("#", 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning(
            "Cannot look at %s URL %s because it does not support lookup as web pages.",
            vcs_scheme,
            link,
        )
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
    if scheme == "file" and os.path.isdir(urllib.request.url2pathname(path)):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith("/"):
            url += "/"
        # TODO: In the future, it would be nice if pip supported PEP 691
        #       style responses in the file:// URLs, however there's no
        #       standard file extension for application/vnd.pypi.simple.v1+json
        #       so we'll need to come up with something on our own.
        url = urllib.parse.urljoin(url, "index.html")
        logger.debug(" file: URL is directory, getting %s", url)

    try:
        resp = _get_simple_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            "Skipping page %s because it looks like an archive, and cannot "
            "be checked by a HTTP HEAD request.",
            link,
        )
    except _NotAPIContent as exc:
        logger.warning(
            "Skipping page %s because the %s request got Content-Type: %s. "
            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
            "application/vnd.pypi.simple.v1+html, and text/html",
            link,
            exc.request_desc,
            exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_simple_fail(link, exc)
    except RetryError as exc:
        _handle_get_simple_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_simple_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_simple_fail(link, f"connection error: {exc}")
    except requests.Timeout:
        _handle_get_simple_fail(link, "timed out")
    else:
        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)

    return None
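

# Illustrative sketch (not part of pip): why the trailing slash is added before
# joining -- without it, urljoin replaces the final path segment.
def _demo_file_url_join() -> None:
    base = "file:///srv/wheels"
    assert urllib.parse.urljoin(base, "index.html") == "file:///srv/index.html"
    assert urllib.parse.urljoin(base + "/", "index.html") == "file:///srv/wheels/index.html"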


class CollectedSources(NamedTuple):
    find_links: Sequence[Optional[LinkSource]]
    index_urls: Sequence[Optional[LinkSource]]


class LinkCollector:
    """
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.

    The class's main method is its collect_sources() method.
    """

    def __init__(
        self,
        session: PipSession,
        search_scope: SearchScope,
    ) -> None:
        self.search_scope = search_scope
        self.session = session

    @classmethod
    def create(
        cls,
        session: PipSession,
        options: Values,
        suppress_no_index: bool = False,
    ) -> "LinkCollector":
        """
        :param session: The Session to use to make requests.
        :param suppress_no_index: Whether to ignore the --no-index option
            when constructing the SearchScope object.
        """
        index_urls = [options.index_url] + options.extra_index_urls
        if options.no_index and not suppress_no_index:
            logger.debug(
                "Ignoring indexes: %s",
                ",".join(redact_auth_from_url(url) for url in index_urls),
            )
            index_urls = []

        # Make sure find_links is a list before passing to create().
        find_links = options.find_links or []

        search_scope = SearchScope.create(
            find_links=find_links,
            index_urls=index_urls,
            no_index=options.no_index,
        )
        link_collector = LinkCollector(
            session=session,
            search_scope=search_scope,
        )
        return link_collector

    @property
    def find_links(self) -> List[str]:
        return self.search_scope.find_links

    def fetch_response(self, location: Link) -> Optional[IndexContent]:
        """
        Fetch an HTML page containing package links.
        """
        return _get_index_content(location, session=self.session)

    def collect_sources(
        self,
        project_name: str,
        candidates_from_page: CandidatesFromPage,
    ) -> CollectedSources:
        # The OrderedDict calls deduplicate sources by URL.
        index_url_sources = collections.OrderedDict(
            build_source(
                loc,
                candidates_from_page=candidates_from_page,
                page_validator=self.session.is_secure_origin,
                expand_dir=False,
                cache_link_parsing=False,
            )
            for loc in self.search_scope.get_index_urls_locations(project_name)
        ).values()
        find_links_sources = collections.OrderedDict(
            build_source(
                loc,
                candidates_from_page=candidates_from_page,
                page_validator=self.session.is_secure_origin,
                expand_dir=True,
                cache_link_parsing=True,
            )
            for loc in self.find_links
        ).values()

        if logger.isEnabledFor(logging.DEBUG):
            lines = [
                f"* {s.link}"
                for s in itertools.chain(find_links_sources, index_url_sources)
                if s is not None and s.link is not None
            ]
            lines = [
                f"{len(lines)} location(s) to search "
                f"for versions of {project_name}:"
            ] + lines
            logger.debug("\n".join(lines))

        return CollectedSources(
            find_links=list(find_links_sources),
            index_urls=list(index_url_sources),
        )