diff --git a/news/11784.feature.rst b/news/11784.feature.rst
new file mode 100644
index 00000000000..6d9c77d23d8
--- /dev/null
+++ b/news/11784.feature.rst
@@ -0,0 +1 @@
+Introduce repository alternate locations and project tracking, as per PEP 708.
diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
index 8ceb818a35d..8aac5da072e 100644
--- a/src/pip/_internal/exceptions.py
+++ b/src/pip/_internal/exceptions.py
@@ -13,7 +13,7 @@
 import re
 import sys
 from itertools import chain, groupby, repeat
-from typing import TYPE_CHECKING, Dict, Iterator, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Dict, Iterator, List, Literal, Optional, Set, Union
 
 from pip._vendor.rich.console import Console, ConsoleOptions, RenderResult
 from pip._vendor.rich.markup import escape
@@ -775,3 +775,134 @@ def __init__(self, *, distribution: "BaseDistribution") -> None:
             ),
             hint_stmt=None,
         )
+
+
+class InvalidMultipleRemoteRepositories(DiagnosticPipError):
+    """Common error for issues with multiple remote repositories."""
+
+    reference = "invalid-multiple-remote-repositories"
+    _note_suffix = (
+        "See PEP 708 for the specification. "
+        "You can override this check, which will disable the security "
+        "protection it provides from dependency confusion attacks, "
+        "by passing --insecure-multiple-remote-repositories."
+    )
+
+
+class InvalidTracksUrl(InvalidMultipleRemoteRepositories):
+    """There was an issue with a Tracks metadata URL.
+
+    Tracks URLs must point to the actual URLs for that project,
+    point to the repositories that own the namespaces, and
+    point to a project with the exact same name (after normalization).
+    """
+
+    reference = "invalid-tracks-url"
+
+    def __init__(
+        self,
+        *,
+        package: str,
+        remote_repositories: Set[str],
+        invalid_tracks: Set[str],
+    ) -> None:
+        super().__init__(
+            kind="error",
+            message=Text(
+                f"One or more Tracks for {escape(package)} "
+                "were not valid. "
+                "The remote repositories are "
+                f"{'; '.join(sorted(escape(r) for r in remote_repositories))}. "
+                "The invalid tracks are "
+                f"{'; '.join(sorted(escape(r) for r in invalid_tracks))}."
+            ),
+            context=Text(
+                "Tracks URLs must point to the actual URLs for a project, "
+                "point to the repositories that own the namespaces, and "
+                "point to a project with the exact same normalized name."
+            ),
+            hint_stmt=None,
+            note_stmt=Text(
+                "The way to resolve this error is to contact the owners of "
+                "each remote repository, and ask if it makes sense to "
+                "configure them to merge namespaces. " + self._note_suffix
+            ),
+        )
+
+
+class InvalidAlternativeLocationsUrl(InvalidMultipleRemoteRepositories):
+    """The lists of Alternate Locations from each repository do not match.
+
+    In order for this metadata to be trusted, there MUST be agreement between
+    all locations where that project is found as to what the alternate locations are.
+    """
+
+    reference = "invalid-alternative-locations"
+
+    def __init__(
+        self,
+        *,
+        package: str,
+        remote_repositories: Set[str],
+        invalid_locations: Set[str],
+    ) -> None:
+        super().__init__(
+            kind="error",
+            message=Text(
+                f"One or more Alternate Locations for {escape(package)} "
+                "were different among the remote repositories. "
+                "The remote repositories are "
+                f"{'; '.join(sorted(escape(r) for r in remote_repositories))}. "
+                "The alternate locations not agreed on by all remote "
+                "repositories are "
+                f"{'; '.join(sorted(escape(r) for r in invalid_locations))}."
+            ),
+            context=Text(
+                "To be able to trust the remote repository Alternate Locations, "
+                "all remote repositories must agree on the list of locations."
+            ),
+            hint_stmt=None,
+            note_stmt=Text(
+                "The way to resolve this error is to contact the owners of the package "
+                "at each remote repository, and ask if it makes sense to "
+                "configure them to merge namespaces. " + self._note_suffix
+            ),
+        )
+
+
+class UnsafeMultipleRemoteRepositories(InvalidMultipleRemoteRepositories):
+    """More than one remote repository was provided for a package,
+    with no indication that the remote repositories can be safely merged.
+
+    The repositories, packages, or user did not indicate that
+    it is safe to merge remote repositories.
+
+    Multiple remote repositories are not merged by default
+    to reduce the risk of dependency confusion attacks.
+    """
+
+    reference = "unsafe-multiple-remote-repositories"
+
+    def __init__(self, *, package: str, remote_repositories: Set[str]) -> None:
+        super().__init__(
+            kind="error",
+            message=Text(
+                f"More than one remote repository was found for {escape(package)}, "
+                "with no indication that the remote repositories can be safely merged. "
+                "The repositories are "
+                f"{'; '.join(sorted(escape(r) for r in remote_repositories))}."
+            ),
+            context=Text(
+                "Multiple remote repositories are not merged by default "
+                "to reduce the risk of dependency confusion attacks."
+            ),
+            hint_stmt=Text(
+                "Remote repositories can be specified or discovered using "
+                "--index-url, --extra-index-url, and --find-links. "
+                "Please check the pip command to see if these are in use."
+            ),
+            note_stmt=Text(
+                "The way to resolve this error is to contact the remote repositories "
+                "and package owners, and ask if it makes sense to configure them to "
+                "merge namespaces. " + self._note_suffix
+            ),
+        )
diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
index 5f8fdee3d46..8a31d442450 100644
--- a/src/pip/_internal/index/collector.py
+++ b/src/pip/_internal/index/collector.py
@@ -24,6 +24,7 @@
     Optional,
     Protocol,
     Sequence,
+    Set,
     Tuple,
     Union,
 )
@@ -33,7 +34,12 @@
 from pip._vendor.requests.exceptions import RetryError, SSLError
 
 from pip._internal.exceptions import NetworkConnectionError
-from pip._internal.models.link import Link
+from pip._internal.models.link import (
+    HEAD_META_ALTERNATE_LOCATIONS,
+    HEAD_META_PREFIX,
+    HEAD_META_TRACKS,
+    Link,
+)
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
@@ -224,13 +230,22 @@ def wrapper_wrapper(page: "IndexContent") -> List[Link]:
 def parse_links(page: "IndexContent") -> Iterable[Link]:
     """
     Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
+    Includes known metadata from the HTML header.
""" - + url = page.url content_type_l = page.content_type.lower() if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) + project_track_urls = set(data.get("meta", {}).get("tracks", [])) + repo_alt_urls = set(data.get("alternate-locations", [])) + repo_alt_urls.add(page.url) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json( + file, + page_url=page.url, + project_track_urls=project_track_urls, + repo_alt_urls=repo_alt_urls, + ) if link is None: continue yield link @@ -240,10 +255,17 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: encoding = page.encoding or "utf-8" parser.feed(page.content.decode(encoding)) - url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + repo_alt_urls = parser.repo_alt_urls or set() + repo_alt_urls.add(page.url) + link = Link.from_element( + anchor, + page_url=url, + base_url=base_url, + project_track_urls=parser.project_track_urls, + repo_alt_urls=repo_alt_urls, + ) if link is None: continue yield link @@ -282,6 +304,8 @@ def __init__(self, url: str) -> None: self.url: str = url self.base_url: Optional[str] = None self.anchors: List[Dict[str, Optional[str]]] = [] + self.project_track_urls: Set[str] = set() + self.repo_alt_urls: Set[str] = set() def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: if tag == "base" and self.base_url is None: @@ -290,6 +314,21 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N self.base_url = href elif tag == "a": self.anchors.append(dict(attrs)) + elif tag == "meta": + meta_attrs = dict(attrs) + meta_key = meta_attrs.get("name", "").strip() + meta_val = meta_attrs.get("content", "").strip() + if meta_key and meta_val: + if ( + meta_key == self._meta_key_tracks + and meta_val not in self.project_track_urls + ): + self.project_track_urls.add(meta_val) + elif ( + meta_key == self._meta_key_alternate_locations + and meta_val not in self.repo_alt_urls + ): + self.repo_alt_urls.add(meta_val) def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: for name, value in attrs: @@ -297,6 +336,14 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: return value return None + @functools.cached_property + def _meta_key_tracks(self) -> str: + return f"{HEAD_META_PREFIX}:{HEAD_META_TRACKS}" + + @functools.cached_property + def _meta_key_alternate_locations(self) -> str: + return f"{HEAD_META_PREFIX}:{HEAD_META_ALTERNATE_LOCATIONS}" + def _handle_get_simple_fail( link: Link, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 0d65ce35f37..fb280453cab 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -4,7 +4,9 @@ import functools import itertools import logging +import pathlib import re +import urllib.parse from dataclasses import dataclass from typing import TYPE_CHECKING, FrozenSet, Iterable, List, Optional, Set, Tuple, Union @@ -17,11 +19,15 @@ from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, + InvalidAlternativeLocationsUrl, InvalidWheelFilename, + InvalidTracksUrl, + UnsafeMultipleRemoteRepositories, UnsupportedWheel, ) from pip._internal.index.collector import LinkCollector, parse_links -from pip._internal.models.candidate import InstallationCandidate +from pip._internal.models.candidate 
+    InstallationCandidate,
+    RemoteInstallationCandidate,
+)
 from pip._internal.models.format_control import FormatControl
 from pip._internal.models.link import Link
 from pip._internal.models.search_scope import SearchScope
@@ -475,6 +481,11 @@ def get_applicable_candidates(
             project_name=self._project_name,
         )
 
+        check_multiple_remote_repositories(
+            candidates=filtered_applicable_candidates,
+            project_name=self._project_name,
+        )
+
         return sorted(filtered_applicable_candidates, key=self._sort_key)
 
     def _sort_key(self, candidate: InstallationCandidate) -> CandidateSortingKey:
@@ -1018,3 +1029,268 @@ def _extract_version_from_fragment(fragment: str, canonical_name: str) -> Optional[str]:
     if not version:
         return None
     return version
+
+
+def check_multiple_remote_repositories(
+    candidates: List[InstallationCandidate], project_name: str
+) -> None:
+    """
+    Check whether two or more different namespaces can be flattened into one.
+
+    This check raises an error when packages from different sources would be
+    merged without clear configuration indicating that the namespace merge is
+    allowed. See `PEP 708`_.
+
+    This approach allows safely merging separate namespaces that are actually one
+    logical namespace, while enforcing a more secure default.
+
+    Returns None if the checks pass, otherwise raises an error with details of
+    the first failed check.
+    """
+
+    # NOTE: The checks in this function must occur after:
+    # Specification: Filter out any files that do not match known hashes from a
+    # lockfile or requirements file.
+
+    # NOTE: The checks in this function must occur before:
+    # Specification: Filter out any files that do not match the current platform,
+    # Python version, etc. We check for the metadata in this PEP before filtering out
+    # based on platform, Python version, etc., because we don’t want errors that only
+    # show up on certain platforms, Python versions, etc.
+
+    # NOTE: Implemented in 'filter_unallowed_hashes':
+    # Specification: Users who are using lock files or requirements files that include
+    # specific hashes of artifacts that are “valid” are assumed to be protected by
+    # nature of those hashes, since the rest of these recommendations would apply
+    # during hash generation. Thus, we filter out unknown hashes up front.
+
+    # NOTE: Implemented in 'collector.py' 'parse_links':
+    # Specification: When using alternate locations, clients MUST implicitly assume
+    # that the url the response was fetched from was included in the list.
+
+    # NOTE: Implemented in 'collector.py' 'parse_links':
+    # Specification: Order of the elements within the array does not have any
+    # particular meaning.
+
+    # NOTE: Implemented by this function, by not checking that every repository's
+    # Tracks metadata is exactly the same.
+    # Specification: Mixed use repositories where some names track a repository and
+    # some names do not are explicitly allowed.
+
+    # TODO: This requirement doesn't look like something that pip can do anything
+    # about. How would pip be able to tell whether two urls are the 'same project'?
+    # Specification: All [Tracks metadata] URLs MUST represent the same “project” as
+    # the project in the extending repository. This does not mean that they need to
+    # serve the same files. It is valid for them to include binaries built on
+    # different platforms, copies with local patches being applied, etc. This is
+    # purposefully left vague as it’s ultimately up to the expectations that the
+    # users have of the repository and its operators what exactly constitutes
+    # the “same” project.
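+
+    # Illustrative example (an assumption for clarity, not taken from this diff):
+    # a mirror that serves https://mirror.example.com/simple/holygrail/ for the
+    # same project as PyPI would declare
+    #     "meta": {"tracks": ["https://pypi.org/simple/holygrail/"]}
+    # in its Simple API response; the Tracks url names the project page on the
+    # owning repository, not that repository's root.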
+
+    # TODO: This requirement doesn't look like something that pip can do anything
+    # about. Tracks metadata is not required, so anything that uses Tracks metadata
+    # must deal with it not being available.
+    # Specification: It [Tracks metadata] is NOT required that every name in a
+    # repository tracks the same repository, or that they all track a repository
+    # at all.
+
+    # TODO: This requirement doesn't look like something that pip can do anything
+    # about. How would pip be able to tell whether the metadata is under the
+    # control of the repository operators and not any individual publisher using
+    # the repository?
+    # Specification: It [repository Tracks metadata] MUST be under the control of the
+    # repository operators themselves, not any individual publisher using that
+    # repository. “Repository Operator” can also include anyone who managed the
+    # overall namespace for a particular repository, which may be the case in
+    # situations like hosted repository services where one entity operates the
+    # software but another owns/manages the entire namespace of that repository.
+
+    # TODO: Consider making requests to repositories revealed via Alternate Locations
+    # or Tracks metadata, to assess the metadata of those additional repositories.
+    # This would be a fair bit more functionality to include.
+    # Maybe a subsequent PR, if this functionality is desirable?
+    # Specification: When an installer encounters a project that is using the
+    # alternate locations metadata it SHOULD consider that all repositories named
+    # are extending the same namespace across multiple repositories.
+    # Implementation: It is not possible to enforce this requirement without access
+    # to the metadata for all of the remote repositories, as the Tracks metadata
+    # might point at a repository that does not exist, or at a repository that is
+    # tracking another repository.
+
+    # TODO: Consider whether pip already has, or should add, ways to indicate exactly
+    # which individual projects to get from exactly which repositories.
+    # This seems to be separate functionality that could be added in a separate PR.
+    # Specification: If the user has explicitly told the installer that it wants to
+    # fetch a project from a certain set of repositories, then there is no reason
+    # to question that and we assume that they’ve made sure it is safe to merge
+    # those namespaces. If the end user has explicitly told the installer to fetch
+    # the project from specific repositories, filter out all other repositories.
+
+    # When no candidates are provided, no checks are relevant, so just return.
+    if candidates is None or len(candidates) == 0:
+        logger.debug("No candidates given to multiple remote repository checks")
+        return None
+
+    # Calculate the canonical name for later comparisons.
+    canonical_name = canonicalize_name(project_name)
+
+    # Specification: Look to see if the discovered files span multiple repositories;
+    # if they do then determine if either “Tracks” or “Alternate Locations” metadata
+    # allows safely merging together ALL the repositories where files were discovered.
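+    # For example (illustrative): files for 'mypackage' discovered on both
+    # https://a.example.com/simple/mypackage/ and
+    # https://b.example.com/simple/mypackage/ may only be merged when the
+    # metadata on those pages links the two repositories together.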
+    remote_candidates = []
+    # all known remote repositories
+    known_remote_repo_urls = set()
+    # all known alternate location urls (each candidate's own url plus any declared)
+    known_alternate_urls = set()
+    # urls named by any candidate's Tracks or declared Alternate Locations metadata
+    declared_metadata_urls = set()
+    # the alternate location urls shared by every candidate that declares them
+    common_alternate_urls: Optional[Set[str]] = None
+    # all known remote repositories that do not track any other repos
+    known_owner_repo_urls = set()
+    # all known remote repositories that do track other repos
+    known_tracker_repo_urls = set()
+    for candidate in candidates:
+        candidate_name = candidate.name
+        link = candidate.link
+
+        # Specification: Repositories that exist on the local filesystem SHOULD
+        # always be implicitly allowed to be merged to any remote repository.
+        if link.is_local_only:
+            # Ignore any local candidates in later comparisons.
+            logger.debug(
+                "Ignoring local candidate %s in multiple remote repository checks",
+                candidate,
+            )
+            continue
+
+        remote_candidate = RemoteInstallationCandidate(
+            canonical_name=canonicalize_name(candidate_name),
+            candidate=candidate,
+        )
+        remote_candidates.append(remote_candidate)
+
+        # populate the known urls
+        known_remote_repo_urls.update(remote_candidate.remote_repository_urls)
+        known_alternate_urls.update(remote_candidate.alternate_location_urls)
+        declared_metadata_urls.update(remote_candidate.project_track_urls)
+        declared_metadata_urls.update(link.repo_alt_urls)
+        if len(remote_candidate.project_track_urls) > 0:
+            known_tracker_repo_urls.add(remote_candidate.url)
+        else:
+            known_owner_repo_urls.add(remote_candidate.url)
+
+        # Candidates that declare Alternate Locations must all agree. Track the
+        # intersection of their (self-inclusive) alternate location sets, so that
+        # any url not present in all of them can be reported.
+        if link.repo_alt_urls:
+            if common_alternate_urls is None:
+                common_alternate_urls = set(remote_candidate.alternate_location_urls)
+            else:
+                common_alternate_urls &= remote_candidate.alternate_location_urls
+
+    # If there are no remote candidates, then allow merging repositories.
+    if len(remote_candidates) == 0:
+        logger.debug("No remote candidates for multiple remote repository checks")
+        return None
+
+    # Specification: If the project in question only comes from a single repository,
+    # then there is no chance of dependency confusion, so there’s no reason to do
+    # anything but allow.
+    if len(known_remote_repo_urls) == 1:
+        logger.debug(
+            "No chance for dependency confusion when there is only "
+            "one remote candidate for multiple remote repository checks"
+        )
+        return None
+    if len(known_remote_repo_urls) == 0:
+        msg = (
+            "Unexpected situation where there are remote candidates, "
+            "but no remote repositories"
+        )
+        logger.warning(msg)
+        raise ValueError(msg)
+
+    # This checks the list of Alternate Locations for the candidates that were
+    # retrieved. Each remote candidate may define different Tracks and
+    # Alternate Locations metadata.
+    # TODO: The Alternate Locations and Tracks urls revealed by the candidate
+    # metadata are not requested to check that their metadata matches the candidate
+    # metadata. This means that the known candidate metadata might agree and pass
+    # this check, while retrieving the metadata from the additional urls would not
+    # agree and would fail this check.
+    # Specification: In order for this metadata to be trusted, there MUST be
+    # agreement between all locations where that project is found as to what the
+    # alternate locations are.
+    if common_alternate_urls is not None:
+        mismatch_alternate_urls = known_alternate_urls - common_alternate_urls
+        if len(mismatch_alternate_urls) > 0:
+            raise InvalidAlternativeLocationsUrl(
+                package=project_name,
+                remote_repositories=known_remote_repo_urls,
+                invalid_locations=mismatch_alternate_urls,
+            )
+
+    # Check the Tracks metadata.
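+    # Illustrative example (an assumption for clarity): a candidate page at
+    # https://mirror.example.com/simple/mypackage/ whose header contains
+    #     <meta name="pypi:tracks" content="https://pypi.org/simple/mypackage/">
+    # passes both checks below, provided the tracked page is an 'owner' page
+    # (a repository page that does not itself declare Tracks metadata).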
+    for remote_candidate in remote_candidates:
+        project_track_urls = remote_candidate.project_track_urls
+        page_url = remote_candidate.url
+        for project_track_url in project_track_urls:
+            # Use PurePosixPath so url paths are split consistently on all
+            # platforms; any() is order-independent, so no reversal is needed.
+            parts = pathlib.PurePosixPath(
+                urllib.parse.urlsplit(project_track_url).path
+            ).parts
+
+            # Specification: It [Tracks metadata] MUST point to the actual URLs
+            # for that project, not the base URL for the extended repositories.
+            # Implementation Note: Assume that the actual url for the project will
+            # contain the canonical project name as a path part, and the base URL
+            # will not contain the canonical project name as a path part.
+
+            # Specification: It [Tracks metadata] MUST point to a project with
+            # the exact same normalized name.
+            # Implementation Note: Assume that projects that are configured to be
+            # the 'exact same' will have a Tracks url path part with the canonical
+            # project name, usually the last path part.
+
+            # Note: These assumptions about the structure of the Tracks url may not
+            # hold true for all remote repositories.
+            if not parts or not any(
+                canonical_name == canonicalize_name(p) for p in parts
+            ):
+                raise InvalidTracksUrl(
+                    package=project_name,
+                    remote_repositories={page_url},
+                    invalid_tracks={project_track_url},
+                )
+
+            # Specification: It [Tracks metadata] MUST point to the repositories
+            # that “own” the namespaces, not another repository that is also
+            # tracking that namespace.
+            # Implementation Note: An 'owner' repository is one that has no Tracks
+            # metadata, so a Tracks url that points at a known repository which
+            # itself tracks other repositories is invalid.
+            # TODO: Without requesting all repositories revealed by metadata, this
+            # check might pass with incomplete knowledge of all metadata,
+            # when it would fail after retrieving all metadata.
+            if project_track_url in known_tracker_repo_urls:
+                raise InvalidTracksUrl(
+                    package=project_name,
+                    remote_repositories={page_url},
+                    invalid_tracks={project_track_url},
+                )
+
+    # Specification: If nothing tells us merging the namespaces is safe, we refuse
+    # to implicitly assume it is, and generate an error instead.
+    # Specification: If that metadata does NOT allow [merging namespaces], then
+    # generate an error.
+    # Implementation Note: With two or more remote candidates, a candidate is only
+    # safe to merge when it declares Tracks or Alternate Locations metadata itself,
+    # or when its repository url is named by metadata declared by another candidate.
+    for remote_candidate in remote_candidates:
+        has_tracks = len(remote_candidate.project_track_urls) > 0
+        declares_alts = len(remote_candidate.candidate.link.repo_alt_urls) > 0
+        is_covered = remote_candidate.url in declared_metadata_urls
+
+        if not has_tracks and not declares_alts and not is_covered:
+            raise UnsafeMultipleRemoteRepositories(
+                package=project_name,
+                remote_repositories=known_remote_repo_urls,
+            )
+
+    # Specification: Otherwise [if metadata allows] we merge the namespaces,
+    # and continue on.
+    return None
diff --git a/src/pip/_internal/models/candidate.py b/src/pip/_internal/models/candidate.py
index f27f283154a..f53b05f20f7 100644
--- a/src/pip/_internal/models/candidate.py
+++ b/src/pip/_internal/models/candidate.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional, Set
 
 from pip._vendor.packaging.version import Version
 from pip._vendor.packaging.version import parse as parse_version
@@ -23,3 +24,66 @@ def __init__(self, name: str, version: str, link: Link) -> None:
 
     def __str__(self) -> str:
         return f"{self.name!r} candidate (version {self.version} at {self.link})"
+
+
+@dataclass(frozen=True)
+class RemoteInstallationCandidate:
+    """An installation candidate paired with the remote repository metadata
+    (PEP 708 Tracks and Alternate Locations) gathered from its link."""
+
+    __slots__ = ["canonical_name", "candidate"]
+
+    canonical_name: str
+    candidate: InstallationCandidate
+
+    def __init__(self, canonical_name: str, candidate: InstallationCandidate) -> None:
+        object.__setattr__(self, "canonical_name", canonical_name)
+        object.__setattr__(self, "candidate", candidate)
+
+    def __str__(self) -> str:
+        return f"{self.canonical_name!r} candidate"
+
+    @property
+    def _link(self) -> Optional[Link]:
+        if not self.candidate or not self.candidate.link:
+            return None
+        return self.candidate.link
+
+    @property
+    def url(self) -> Optional[str]:
+        """The remote url that contains the metadata for this candidate."""
+        link = self._link
+        if not link or not link.comes_from:
+            return None
+        comes_from = link.comes_from
+        # comes_from is either an IndexContent (with a .url) or a plain string.
+        if hasattr(comes_from, "url"):
+            return comes_from.url.strip() if comes_from.url else None
+        return comes_from.strip()
+
+    @property
+    def remote_repository_urls(self) -> Set[str]:
+        """Remote repository urls from Tracks and Alternate Locations metadata."""
+        return {*self.project_track_urls, *self.alternate_location_urls}
+
+    @property
+    def project_track_urls(self) -> Set[str]:
+        """Remote repository urls from Tracks metadata."""
+        result = set()
+        link = self._link
+        if link and link.project_track_urls:
+            result.update(link.project_track_urls)
+        return {i for i in result if i}
+
+    @property
+    def alternate_location_urls(self) -> Set[str]:
+        """Remote repository urls from Alternate Locations metadata,
+        including this candidate's own repository url."""
+        result = {self.url}
+        link = self._link
+        if link and link.repo_alt_urls:
+            result.update(link.repo_alt_urls)
+        return {i for i in result if i}
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py
index 2f41f2f6a09..9f008a27c7b 100644
--- a/src/pip/_internal/models/link.py
+++ b/src/pip/_internal/models/link.py
@@ -14,6 +14,7 @@
     Mapping,
     NamedTuple,
     Optional,
+    Set,
     Tuple,
     Union,
 )
@@ -39,6 +40,10 @@
 # we will pick to use.
 _SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5")
 
+HEAD_META_PREFIX = "pypi"
+HEAD_META_ALTERNATE_LOCATIONS = "alternate-locations"
+HEAD_META_TRACKS = "tracks"
+
 
 @dataclass(frozen=True)
 class LinkHash:
@@ -192,6 +197,8 @@ class Link:
         "metadata_file_data",
         "cache_link_parsing",
         "egg_fragment",
+        "project_track_urls",
+        "repo_alt_urls",
     ]
 
     def __init__(
@@ -203,6 +210,8 @@ def __init__(
         metadata_file_data: Optional[MetadataFile] = None,
         cache_link_parsing: bool = True,
         hashes: Optional[Mapping[str, str]] = None,
+        project_track_urls: Optional[Set[str]] = None,
+        repo_alt_urls: Optional[Set[str]] = None,
    ) -> None:
         """
         :param url: url of the resource pointed to (href of the link)
@@ -227,6 +236,10 @@ def __init__(
             URLs should generally have this set to False, for example.
         :param hashes: A mapping of hash names to digests to allow us to
             determine the validity of a download.
+        :param project_track_urls: An optional set of urls pointing to the same
+            project in other repositories. Defined by the repository operators.
+        :param repo_alt_urls: An optional set of urls pointing to alternate
+            locations for the project. Defined by the project owners.
         """
 
         # The comes_from, requires_python, and metadata_file_data arguments are
@@ -257,11 +270,16 @@ def __init__(
         self.cache_link_parsing = cache_link_parsing
         self.egg_fragment = self._egg_fragment()
 
+        self.project_track_urls = project_track_urls or set()
+        self.repo_alt_urls = repo_alt_urls or set()
+
     @classmethod
     def from_json(
         cls,
         file_data: Dict[str, Any],
         page_url: str,
+        project_track_urls: Optional[Set[str]] = None,
+        repo_alt_urls: Optional[Set[str]] = None,
     ) -> Optional["Link"]:
         """
         Convert an pypi json document from a simple repository page into a Link.
@@ -299,6 +317,9 @@ def from_json(
         elif not yanked_reason:
             yanked_reason = None
 
+        # Guard against None before iterating, and drop empty entries.
+        project_track_urls = {
+            i.strip() for i in (project_track_urls or set()) if i and i.strip()
+        }
+        repo_alt_urls = {i.strip() for i in (repo_alt_urls or set()) if i and i.strip()}
+
         return cls(
             url,
             comes_from=page_url,
@@ -306,6 +327,8 @@ def from_json(
             yanked_reason=yanked_reason,
             hashes=hashes,
             metadata_file_data=metadata_file_data,
+            project_track_urls=project_track_urls,
+            repo_alt_urls=repo_alt_urls,
         )
 
     @classmethod
@@ -314,6 +337,8 @@ def from_element(
         anchor_attribs: Dict[str, Optional[str]],
         page_url: str,
         base_url: str,
+        project_track_urls: Optional[Set[str]] = None,
+        repo_alt_urls: Optional[Set[str]] = None,
     ) -> Optional["Link"]:
         """
         Convert an anchor element's attributes in a simple repository page to a Link.
@@ -352,12 +377,17 @@ def from_element(
             )
             metadata_file_data = MetadataFile(None)
 
+        # Guard against None before iterating, and drop empty entries.
+        project_track_urls = {
+            i.strip() for i in (project_track_urls or set()) if i and i.strip()
+        }
+        repo_alt_urls = {i.strip() for i in (repo_alt_urls or set()) if i and i.strip()}
+
         return cls(
             url,
             comes_from=page_url,
             requires_python=pyrequire,
             yanked_reason=yanked_reason,
             metadata_file_data=metadata_file_data,
+            project_track_urls=project_track_urls,
+            repo_alt_urls=repo_alt_urls,
         )
 
     def __str__(self) -> str:
@@ -526,6 +556,28 @@ def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool:
             return False
         return any(hashes.is_hash_allowed(k, v) for k, v in self._hashes.items())
 
+    @property
+    def is_local_only(self) -> bool:
+        """
+        Is this link entirely local, with no metadata pointing to a remote url?
+ """ + logger.debug( + { + "is_file": self.is_file, + "file_path": self.file_path if self.is_file else None, + "comes_from": self.comes_from, + "metadata_urls": {*self.project_track_urls, *self.repo_alt_urls}, + } + ) + return ( + (self.is_file and self.file_path) + and not self.comes_from + and not any( + not i.startswith("file:") + for i in {*self.project_track_urls, *self.repo_alt_urls} + ) + ) + class _CleanResult(NamedTuple): """Convert link for equivalency check. diff --git a/tests/data/indexes/repository-alternate-01/simple/index.html b/tests/data/indexes/repository-alternate-01/simple/index.html new file mode 100644 index 00000000000..e9174877f7d --- /dev/null +++ b/tests/data/indexes/repository-alternate-01/simple/index.html @@ -0,0 +1,11 @@ + + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/data/indexes/repository-alternate-02/simple/index.html b/tests/data/indexes/repository-alternate-02/simple/index.html new file mode 100644 index 00000000000..c5e6f02039c --- /dev/null +++ b/tests/data/indexes/repository-alternate-02/simple/index.html @@ -0,0 +1,10 @@ + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/data/indexes/repository-tracks-01/simple/index.html b/tests/data/indexes/repository-tracks-01/simple/index.html new file mode 100644 index 00000000000..5b7aa5ac26f --- /dev/null +++ b/tests/data/indexes/repository-tracks-01/simple/index.html @@ -0,0 +1,10 @@ + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 882f82ae4fe..1c7d1ceae42 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -6,7 +6,7 @@ import uuid from pathlib import Path from textwrap import dedent -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Set from unittest import mock import pytest @@ -482,10 +482,18 @@ def test_parse_links__requires_python( # in test_download.py execute over both html and json indices with # a pytest.mark.parameterize decorator to ensure nothing slips through the cracks. def test_parse_links_json() -> None: + meta_tracks_alt_repos = [ + "https://example.com/simple/holygrail/", + "https://example.net/simple/holygrail/", + ] json_bytes = json.dumps( { - "meta": {"api-version": "1.0"}, + "meta": { + "api-version": "1.0", + "tracks": list(meta_tracks_alt_repos), + }, "name": "holygrail", + "alternate-locations": list(meta_tracks_alt_repos), "files": [ { "filename": "holygrail-1.0.tar.gz", @@ -529,23 +537,26 @@ def test_parse_links_json() -> None: ], } ).encode("utf8") + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. + url = f"https://example.com/simple-{uuid.uuid4()}/" page = IndexContent( json_bytes, "application/vnd.pypi.simple.v1+json", encoding=None, - # parse_links() is cached by url, so we inject a random uuid to ensure - # the page content isn't cached. 
-        url=f"https://example.com/simple-{uuid.uuid4()}/",
+        url=url,
     )
     links = list(parse_links(page))
-    assert links == [
+    expected = [
         Link(
             "https://example.com/files/holygrail-1.0.tar.gz",
             comes_from=page.url,
             requires_python=">=3.7",
             yanked_reason="Had a vulnerability",
             hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
+            project_track_urls=set(meta_tracks_alt_repos),
+            repo_alt_urls={*meta_tracks_alt_repos, url},
         ),
         Link(
             "https://example.com/files/holygrail-1.0-py3-none-any.whl",
@@ -553,6 +564,8 @@ def test_parse_links_json() -> None:
             requires_python=">=3.7",
             yanked_reason=None,
             hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
+            project_track_urls=set(meta_tracks_alt_repos),
+            repo_alt_urls={*meta_tracks_alt_repos, url},
         ),
         Link(
             "https://example.com/files/holygrail-1.0-py3-none-any.whl",
@@ -561,6 +574,8 @@ def test_parse_links_json() -> None:
             yanked_reason=None,
             hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
             metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
+            project_track_urls=set(meta_tracks_alt_repos),
+            repo_alt_urls={*meta_tracks_alt_repos, url},
         ),
         Link(
             "https://example.com/files/holygrail-1.0-py3-none-any.whl",
@@ -569,6 +584,8 @@ def test_parse_links_json() -> None:
             yanked_reason=None,
             hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
             metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
+            project_track_urls=set(meta_tracks_alt_repos),
+            repo_alt_urls={*meta_tracks_alt_repos, url},
         ),
         Link(
             "https://example.com/files/holygrail-1.0-py3-none-any.whl",
@@ -577,9 +594,17 @@ def test_parse_links_json() -> None:
             yanked_reason=None,
             hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"},
             metadata_file_data=MetadataFile({"sha512": "aabdd41"}),
+            project_track_urls=set(meta_tracks_alt_repos),
+            repo_alt_urls={*meta_tracks_alt_repos, url},
         ),
     ]
 
+    # Compare attribute-by-attribute: Link.__eq__ compares urls only, which
+    # would not exercise the new metadata fields.
+    def _convert_links(link_item: Link) -> Dict[str, Any]:
+        return {key: getattr(link_item, key, None) for key in link_item.__slots__}
+
+    assert len(links) == len(expected)
+    for index, link in enumerate(links):
+        assert _convert_links(link) == _convert_links(expected[index])
+
     # Ensure the metadata info can be parsed into the correct link.
     metadata_link = links[2].metadata_link()
     assert metadata_link is not None
@@ -721,6 +746,46 @@ def test_parse_links_caches_same_page_by_url() -> None:
     assert "pkg2" in parsed_links_3[0].url
 
 
+@pytest.mark.parametrize(
+    ("index_name", "expected_project_track_urls", "expected_repo_alt_urls"),
+    [
+        (
+            "repository-alternate-01",
+            set(),
+            {
+                "../../repository-alternate-02/simple/",
+                "../../repository-tracks-01/simple/",
+            },
+        ),
+        ("repository-alternate-02", set(), {"../../repository-alternate-01/simple/"}),
+        ("repository-tracks-01", {"../../repository-alternate-01/simple/"}, set()),
+    ],
+)
+def test_parse_links__alternate_locations_and_tracks(
+    index_name: str,
+    expected_project_track_urls: Set[str],
+    expected_repo_alt_urls: Set[str],
+    data: TestData,
+) -> None:
+    package_name = "simple"
+    html_path = data.indexes / index_name / package_name / "index.html"
+    html = html_path.read_text()
+    html_bytes = html.encode("utf-8")
+    # parse_links() is cached by url, so we inject a random uuid to ensure
+    # the page content isn't cached.
+    url = f"https://example.com/simple-{uuid.uuid4()}/"
+    page = IndexContent(
+        html_bytes,
+        "text/html",
+        encoding=None,
+        url=url,
+    )
+    links = list(parse_links(page))
+    assert links, "expected at least one parsed link"
+    for link in links:
+        assert link.project_track_urls == expected_project_track_urls
+        assert link.repo_alt_urls == {*expected_repo_alt_urls, url}
+
+
 @mock.patch("pip._internal.index.collector.raise_for_status")
 def test_request_http_error(
     mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture
diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py
index d02c70b260e..ba9f8b21ac0 100644
--- a/tests/unit/test_index.py
+++ b/tests/unit/test_index.py
@@ -1,11 +1,19 @@
 import logging
 from typing import FrozenSet, List, Optional, Set, Tuple
 
 import pytest
 from pip._vendor.packaging.specifiers import SpecifierSet
 from pip._vendor.packaging.tags import Tag
 
-from pip._internal.index.collector import LinkCollector
+from pip._internal.exceptions import (
+    InvalidAlternativeLocationsUrl,
+    InvalidTracksUrl,
+    UnsafeMultipleRemoteRepositories,
+)
+from pip._internal.index.collector import IndexContent, LinkCollector
 from pip._internal.index.package_finder import (
     CandidateEvaluator,
     CandidatePreferences,
@@ -16,6 +24,7 @@
     _check_link_requires_python,
     _extract_version_from_fragment,
     _find_name_version_sep,
+    check_multiple_remote_repositories,
     filter_unallowed_hashes,
 )
+from pip._internal.models.candidate import InstallationCandidate
 from pip._internal.models.link import Link
@@ -901,3 +910,184 @@ def test_extract_version_from_fragment(
 ) -> None:
     version = _extract_version_from_fragment(fragment, canonical_name)
     assert version == expected
+
+
+def _make_mock_candidate_check_remote_repo(
+    candidate_name: Optional[str] = None,
+    version: Optional[str] = None,
+    comes_from_url: Optional[str] = None,
+    project_track_urls: Optional[Set[str]] = None,
+    repo_alt_urls: Optional[Set[str]] = None,
+) -> InstallationCandidate:
+    if candidate_name is None:
+        candidate_name = "mypackage"
+
+    if version is None:
+        version = "1.0"
+
+    comes_from = None
+    if comes_from_url is not None:
+        comes_from = IndexContent(
+            b"",
+            "text/html",
+            encoding=None,
+            url=comes_from_url,
+        )
+
+    url = f"https://example.com/pkg-{version}.tar.gz"
+
+    link = Link(
+        url,
+        comes_from=comes_from,
+        project_track_urls=project_track_urls,
+        repo_alt_urls=repo_alt_urls,
+    )
+    return InstallationCandidate(candidate_name, version, link)
+
+
+@pytest.mark.parametrize(
+    "candidates, project_name, expected",
+    [
+        # checks pass when no candidates
+        ([], "my_package", None),
+        # checks pass when only one candidate
+        (
+            [
+                _make_mock_candidate_check_remote_repo(
+                    comes_from_url="https://a.example.com/simple/mypackage",
+                )
+            ],
+            "mypackage",
+            None,
+        ),
+        # checks fail when two candidates have different remotes
+        # and no metadata to enable merging namespaces
+        (
+            [
+                _make_mock_candidate_check_remote_repo(
+                    comes_from_url="https://a.example.com/simple/mypackage",
+                ),
+                _make_mock_candidate_check_remote_repo(
+                    comes_from_url="https://b.example.com/simple/mypackage",
+                ),
+            ],
+            "mypackage",
+            UnsafeMultipleRemoteRepositories,
+        ),
+        # checks pass when one of two candidates has a tracks url
+        # pointing at the other
+        # TODO: not making requests to repos revealed via metadata
+        (
+            [
+                _make_mock_candidate_check_remote_repo(
+                    comes_from_url="https://a.example.com/simple/mypackage",
+                ),
+                _make_mock_candidate_check_remote_repo(
+                    comes_from_url="https://b.example.com/simple/mypackage",
project_track_urls={"https://a.example.com/simple/mypackage"}, + ), + ], + "mypackage", + None, + ), + # checks pass when ony one candidate with alt loc url + # TODO: not making requests to repos revealed via metadata + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url="https://a.example.com/simple/mypackage", + ), + _make_mock_candidate_check_remote_repo( + comes_from_url="https://b.example.com/simple/mypackage", + repo_alt_urls={"https://a.example.com/simple/mypackage"}, + ), + ], + "mypackage", + None, + ), + # checks fail when alternate location urls do not agree + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url="https://a.example.com/simple/mypackage", + repo_alt_urls={"https://b.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url="https://b.example.com/simple/mypackage", + repo_alt_urls={"https://c.example.com/simple/mypackage"}, + ), + ], + "mypackage", + InvalidAlternativeLocationsUrl, + ), + # checks fails when track url points to the base url instead of + # the project url + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url="https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com"}, + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks fail when track url points to another tracker instead of + # the 'owner' project url + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url="https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url="https://b.example.com/simple/mypackage", + project_track_urls={"https://c.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url="https://c.example.com/simple/mypackage", + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks fail when track url points to different name instead of + # the project url with same name + ( + [ + _make_mock_candidate_check_remote_repo( + candidate_name="othername", + comes_from_url="https://a.example.com/simple/othername", + project_track_urls={"https://b.example.com/simple/othername"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url="https://b.example.com/simple/mypackage", + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks pass when track url points to same normalized name + ( + [ + _make_mock_candidate_check_remote_repo( + candidate_name="a.b-c_d", + comes_from_url="https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com/simple/a_b.c-d"}, + ), + ], + "a-b_c.d", + None, + ), + ], +) +def test_check_multiple_remote_repositories( + caplog, candidates: List[InstallationCandidate], project_name: str, expected +): + caplog.set_level(logging.DEBUG) + if expected: + with pytest.raises(expected): + check_multiple_remote_repositories(candidates, project_name) + else: + assert check_multiple_remote_repositories(candidates, project_name) is None