From 618af7aec63d45316e29a715fe6f8582eabf1e43 Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Sun, 30 Jun 2024 20:39:58 +1000 Subject: [PATCH 1/6] add repository information parsing --- news/11784.feature.rst | 1 + src/pip/_internal/index/collector.py | 27 +++++++++++++++++++++++++-- src/pip/_internal/models/link.py | 20 ++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 news/11784.feature.rst diff --git a/news/11784.feature.rst b/news/11784.feature.rst new file mode 100644 index 00000000000..c2a0ce56040 --- /dev/null +++ b/news/11784.feature.rst @@ -0,0 +1 @@ +Introduce repository alternate locations and project tracking, as per PEP 708. \ No newline at end of file diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 5f8fdee3d46..bedb0f6638f 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -230,7 +230,12 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json( + file, + page_url=page.url, + project_track_urls=data.get("meta", {}).get('tracks', []), + repo_alt_urls=data.get("alternate-locations", []), + ) if link is None: continue yield link @@ -243,7 +248,13 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + link = Link.from_element( + anchor, + page_url=url, + base_url=base_url, + project_track_urls=parser.project_track_urls, + repo_alt_urls=parser.repo_alt_urls, + ) if link is None: continue yield link @@ -282,6 +293,8 @@ def __init__(self, url: str) -> None: self.url: str = url self.base_url: Optional[str] = None self.anchors: List[Dict[str, Optional[str]]] = [] + 
self.project_track_urls: List[str] = [] + self.repo_alt_urls: List[str] = [] def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: if tag == "base" and self.base_url is None: @@ -290,6 +303,16 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N self.base_url = href elif tag == "a": self.anchors.append(dict(attrs)) + elif tag == "meta": + for name, value in attrs: + if not value or not value.strip(): + continue + url = value.strip() + # PEP 708 + if name == "pypi:tracks" and url not in self.project_track_urls: + self.project_track_urls.append(url) + elif name == "pypi:alternate-locations" and url not in self.repo_alt_urls: + self.repo_alt_urls.append(url) def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: for name, value in attrs: diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2f41f2f6a09..5f7d327fdf5 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -192,6 +192,8 @@ class Link: "metadata_file_data", "cache_link_parsing", "egg_fragment", + "project_track_urls", + "repo_alt_urls", ] def __init__( @@ -203,6 +205,8 @@ def __init__( metadata_file_data: Optional[MetadataFile] = None, cache_link_parsing: bool = True, hashes: Optional[Mapping[str, str]] = None, + project_track_urls: Optional[List[str]] = None, + repo_alt_urls: Optional[List[str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) @@ -227,6 +231,10 @@ def __init__( URLs should generally have this set to False, for example. :param hashes: A mapping of hash names to digests to allow us to determine the validity of a download. + :param project_track_urls: An optional list of urls pointing to the same + project in other repositories. Defined by the repository operators. + :param repo_alt_urls: An optional list of urls pointing to alternate + locations for the project. Defined by the project owners. 
""" # The comes_from, requires_python, and metadata_file_data arguments are @@ -257,11 +265,17 @@ def __init__( self.cache_link_parsing = cache_link_parsing self.egg_fragment = self._egg_fragment() + # PEP 708 + self.project_track_urls = project_track_urls or [] + self.repo_alt_urls = repo_alt_urls or [] + @classmethod def from_json( cls, file_data: Dict[str, Any], page_url: str, + project_track_urls: Optional[List[str]] = None, + repo_alt_urls: Optional[List[str]] = None, ) -> Optional["Link"]: """ Convert an pypi json document from a simple repository page into a Link. @@ -306,6 +320,8 @@ def from_json( yanked_reason=yanked_reason, hashes=hashes, metadata_file_data=metadata_file_data, + project_track_urls=project_track_urls, + repo_alt_urls=repo_alt_urls, ) @classmethod @@ -314,6 +330,8 @@ def from_element( anchor_attribs: Dict[str, Optional[str]], page_url: str, base_url: str, + project_track_urls: Optional[List[str]] = None, + repo_alt_urls: Optional[List[str]] = None, ) -> Optional["Link"]: """ Convert an anchor element's attributes in a simple repository page to a Link. 
@@ -358,6 +376,8 @@ def from_element( requires_python=pyrequire, yanked_reason=yanked_reason, metadata_file_data=metadata_file_data, + project_track_urls=project_track_urls, + repo_alt_urls=repo_alt_urls, ) def __str__(self) -> str: From 02c5f4afcabf529b8533f2793d2aa311898a8196 Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Sat, 27 Jul 2024 17:17:31 +1000 Subject: [PATCH 2/6] add tests for parsing repository details --- src/pip/_internal/index/collector.py | 58 ++++++++++----- src/pip/_internal/models/link.py | 21 +++--- .../repository-alternate-01/simple/index.html | 11 +++ .../repository-alternate-02/simple/index.html | 10 +++ .../repository-tracks-01/simple/index.html | 10 +++ tests/unit/test_collector.py | 71 ++++++++++++++++++- 6 files changed, 153 insertions(+), 28 deletions(-) create mode 100644 tests/data/indexes/repository-alternate-01/simple/index.html create mode 100644 tests/data/indexes/repository-alternate-02/simple/index.html create mode 100644 tests/data/indexes/repository-tracks-01/simple/index.html diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index bedb0f6638f..69bb812f5c9 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -24,6 +24,7 @@ Optional, Protocol, Sequence, + Set, Tuple, Union, ) @@ -33,7 +34,12 @@ from pip._vendor.requests.exceptions import RetryError, SSLError from pip._internal.exceptions import NetworkConnectionError -from pip._internal.models.link import Link +from pip._internal.models.link import ( + Link, + HEAD_META_PREFIX, + HEAD_META_ALTERNATE_LOCATIONS, + HEAD_META_TRACKS, +) from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status @@ -224,17 +230,21 @@ def wrapper_wrapper(page: "IndexContent") -> List[Link]: def parse_links(page: "IndexContent") -> Iterable[Link]: """ Parse a Simple API's Index Content, and yield its 
anchor elements as Link objects. + Includes known metadata from the HTML header. """ - + url = page.url content_type_l = page.content_type.lower() if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) + project_track_urls = set(data.get("meta", {}).get("tracks", [])) + repo_alt_urls = set(data.get("alternate-locations", [])) + repo_alt_urls.add(page.url) for file in data.get("files", []): link = Link.from_json( file, page_url=page.url, - project_track_urls=data.get("meta", {}).get('tracks', []), - repo_alt_urls=data.get("alternate-locations", []), + project_track_urls=project_track_urls, + repo_alt_urls=repo_alt_urls, ) if link is None: continue @@ -245,15 +255,16 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: encoding = page.encoding or "utf-8" parser.feed(page.content.decode(encoding)) - url = page.url base_url = parser.base_url or url for anchor in parser.anchors: + repo_alt_urls = parser.repo_alt_urls or set() + repo_alt_urls.add(page.url) link = Link.from_element( anchor, page_url=url, base_url=base_url, project_track_urls=parser.project_track_urls, - repo_alt_urls=parser.repo_alt_urls, + repo_alt_urls=repo_alt_urls, ) if link is None: continue @@ -293,8 +304,8 @@ def __init__(self, url: str) -> None: self.url: str = url self.base_url: Optional[str] = None self.anchors: List[Dict[str, Optional[str]]] = [] - self.project_track_urls: List[str] = [] - self.repo_alt_urls: List[str] = [] + self.project_track_urls: Set[str] = set() + self.repo_alt_urls: Set[str] = set() def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: if tag == "base" and self.base_url is None: @@ -304,15 +315,20 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N elif tag == "a": self.anchors.append(dict(attrs)) elif tag == "meta": - for name, value in attrs: - if not value or not value.strip(): - continue - url = value.strip() - # PEP 708 - if name == "pypi:tracks" and 
url not in self.project_track_urls: - self.project_track_urls.append(url) - elif name == "pypi:alternate-locations" and url not in self.repo_alt_urls: - self.repo_alt_urls.append(url) + meta_attrs = dict(attrs) + meta_key = meta_attrs.get("name", "").strip() + meta_val = meta_attrs.get("content", "").strip() + if meta_key and meta_val: + if ( + meta_key == self._meta_key_tracks + and meta_val not in self.project_track_urls + ): + self.project_track_urls.add(meta_val) + elif ( + meta_key == self._meta_key_alternate_locations + and meta_val not in self.repo_alt_urls + ): + self.repo_alt_urls.add(meta_val) def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: for name, value in attrs: @@ -320,6 +336,14 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: return value return None + @functools.cached_property + def _meta_key_tracks(self): + return f"{HEAD_META_PREFIX}:{HEAD_META_TRACKS}" + + @functools.cached_property + def _meta_key_alternate_locations(self): + return f"{HEAD_META_PREFIX}:{HEAD_META_ALTERNATE_LOCATIONS}" + def _handle_get_simple_fail( link: Link, diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 5f7d327fdf5..2bdbb827029 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -14,6 +14,7 @@ Mapping, NamedTuple, Optional, + Set, Tuple, Union, ) @@ -39,6 +40,10 @@ # we will pick to use. 
_SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") +HEAD_META_PREFIX = "pypi" +HEAD_META_ALTERNATE_LOCATIONS = "alternate-locations" +HEAD_META_TRACKS = "tracks" + @dataclass(frozen=True) class LinkHash: @@ -205,8 +210,8 @@ def __init__( metadata_file_data: Optional[MetadataFile] = None, cache_link_parsing: bool = True, hashes: Optional[Mapping[str, str]] = None, - project_track_urls: Optional[List[str]] = None, - repo_alt_urls: Optional[List[str]] = None, + project_track_urls: Optional[Set[str]] = None, + repo_alt_urls: Optional[Set[str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) @@ -266,16 +271,16 @@ def __init__( self.egg_fragment = self._egg_fragment() # PEP 708 - self.project_track_urls = project_track_urls or [] - self.repo_alt_urls = repo_alt_urls or [] + self.project_track_urls = project_track_urls or set() + self.repo_alt_urls = repo_alt_urls or set() @classmethod def from_json( cls, file_data: Dict[str, Any], page_url: str, - project_track_urls: Optional[List[str]] = None, - repo_alt_urls: Optional[List[str]] = None, + project_track_urls: Optional[set[str]] = None, + repo_alt_urls: Optional[set[str]] = None, ) -> Optional["Link"]: """ Convert an pypi json document from a simple repository page into a Link. @@ -330,8 +335,8 @@ def from_element( anchor_attribs: Dict[str, Optional[str]], page_url: str, base_url: str, - project_track_urls: Optional[List[str]] = None, - repo_alt_urls: Optional[List[str]] = None, + project_track_urls: Optional[Set[str]] = None, + repo_alt_urls: Optional[Set[str]] = None, ) -> Optional["Link"]: """ Convert an anchor element's attributes in a simple repository page to a Link. 
diff --git a/tests/data/indexes/repository-alternate-01/simple/index.html b/tests/data/indexes/repository-alternate-01/simple/index.html new file mode 100644 index 00000000000..e9174877f7d --- /dev/null +++ b/tests/data/indexes/repository-alternate-01/simple/index.html @@ -0,0 +1,11 @@ + + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/data/indexes/repository-alternate-02/simple/index.html b/tests/data/indexes/repository-alternate-02/simple/index.html new file mode 100644 index 00000000000..c5e6f02039c --- /dev/null +++ b/tests/data/indexes/repository-alternate-02/simple/index.html @@ -0,0 +1,10 @@ + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/data/indexes/repository-tracks-01/simple/index.html b/tests/data/indexes/repository-tracks-01/simple/index.html new file mode 100644 index 00000000000..5b7aa5ac26f --- /dev/null +++ b/tests/data/indexes/repository-tracks-01/simple/index.html @@ -0,0 +1,10 @@ + + + + + + + +simple-1.0.tar.gz + + diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 89da25b73d8..7a8bc9efa5e 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -482,10 +482,18 @@ def test_parse_links__requires_python( # in test_download.py execute over both html and json indices with # a pytest.mark.parameterize decorator to ensure nothing slips through the cracks. 
def test_parse_links_json() -> None: + meta_tracks_alt_repos = [ + "https://example.com/simple/holygrail/", + "https://example.net/simple/holygrail/", + ] json_bytes = json.dumps( { - "meta": {"api-version": "1.0"}, + "meta": { + "api-version": "1.0", + "tracks": list(meta_tracks_alt_repos), + }, "name": "holygrail", + "alternate-locations": list(meta_tracks_alt_repos), "files": [ { "filename": "holygrail-1.0.tar.gz", @@ -529,23 +537,26 @@ def test_parse_links_json() -> None: ], } ).encode("utf8") + url = f"https://example.com/simple-{uuid.uuid4()}/" page = IndexContent( json_bytes, "application/vnd.pypi.simple.v1+json", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. - url=f"https://example.com/simple-{uuid.uuid4()}/", + url=url, ) links = list(parse_links(page)) - assert links == [ + expected = [ Link( "https://example.com/files/holygrail-1.0.tar.gz", comes_from=page.url, requires_python=">=3.7", yanked_reason="Had a vulnerability", hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + project_track_urls=set(meta_tracks_alt_repos), + repo_alt_urls={*meta_tracks_alt_repos, url}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", @@ -553,6 +564,8 @@ def test_parse_links_json() -> None: requires_python=">=3.7", yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + project_track_urls=set(meta_tracks_alt_repos), + repo_alt_urls={*meta_tracks_alt_repos, url}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", @@ -561,6 +574,8 @@ def test_parse_links_json() -> None: yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, metadata_file_data=MetadataFile({"sha512": "aabdd41"}), + project_track_urls=set(meta_tracks_alt_repos), + repo_alt_urls={*meta_tracks_alt_repos, url}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", @@ -569,6 +584,8 @@ def test_parse_links_json() -> None: 
yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, metadata_file_data=MetadataFile({"sha512": "aabdd41"}), + project_track_urls=set(meta_tracks_alt_repos), + repo_alt_urls={*meta_tracks_alt_repos, url}, ), Link( "https://example.com/files/holygrail-1.0-py3-none-any.whl", @@ -577,9 +594,17 @@ def test_parse_links_json() -> None: yanked_reason=None, hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, metadata_file_data=MetadataFile({"sha512": "aabdd41"}), + project_track_urls=set(meta_tracks_alt_repos), + repo_alt_urls={*meta_tracks_alt_repos, url}, ), ] + def _convert_links(link_item): + return {key: getattr(link_item, key, None) for key in link_item.__slots__} + + for index, link in enumerate(links): + assert _convert_links(link) == _convert_links(expected[index]) + # Ensure the metadata info can be parsed into the correct link. metadata_link = links[2].metadata_link() assert metadata_link is not None @@ -721,6 +746,46 @@ def test_parse_links_caches_same_page_by_url() -> None: assert "pkg2" in parsed_links_3[0].url +@pytest.mark.parametrize( + ("index_name", "expected_project_track_urls", "expected_repo_alt_urls"), + [ + ( + "repository-alternate-01", + set(), + { + "../../repository-alternate-02/simple/", + "../../repository-tracks-01/simple/", + }, + ), + ("repository-alternate-02", set(), {"../../repository-alternate-01/simple/"}), + ("repository-tracks-01", {"../../repository-alternate-01/simple/"}, set()), + ], +) +def test_parse_links__alternate_locations_and_tracks( + index_name: str, + expected_project_track_urls: list[str], + expected_repo_alt_urls: list[str], + data: TestData, +) -> None: + package_name = "simple" + html_path = data.indexes / index_name / package_name / "index.html" + html = html_path.read_text() + html_bytes = html.encode("utf-8") + url = f"https://example.com/simple-{uuid.uuid4()}/" + page = IndexContent( + html_bytes, + "text/html", + encoding=None, + # parse_links() is cached by url, so we inject 
a random uuid to ensure + # the page content isn't cached. + url=url, + ) + links = list(parse_links(page)) + for link in links: + assert link.project_track_urls == expected_project_track_urls + assert link.repo_alt_urls == {*expected_repo_alt_urls, url} + + @mock.patch("pip._internal.index.collector.raise_for_status") def test_request_http_error( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture From b36be68e531191c340f50665a6be4758aa4716d8 Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Sat, 3 Aug 2024 15:18:48 +1000 Subject: [PATCH 3/6] add initial checks from specification Add exceptions. Fix some lint issues. --- news/11784.feature.rst | 2 +- src/pip/_internal/exceptions.py | 102 ++++++++++ src/pip/_internal/index/collector.py | 8 +- src/pip/_internal/index/package_finder.py | 225 ++++++++++++++++++++++ tests/unit/test_collector.py | 6 +- tests/unit/test_index.py | 4 + 6 files changed, 339 insertions(+), 8 deletions(-) diff --git a/news/11784.feature.rst b/news/11784.feature.rst index c2a0ce56040..6d9c77d23d8 100644 --- a/news/11784.feature.rst +++ b/news/11784.feature.rst @@ -1 +1 @@ -Introduce repository alternate locations and project tracking, as per PEP 708. \ No newline at end of file +Introduce repository alternate locations and project tracking, as per PEP 708. diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 2587740f73a..2dd4c37145b 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -775,3 +775,105 @@ def __init__(self, *, distribution: "BaseDistribution") -> None: ), hint_stmt=None, ) + + +class InvalidMultipleRemoteRepositories(DiagnosticPipError): + """Common error for issues with multiple remote repositories.""" + + reference = "invalid-multiple-remote-repositories" + + +class InvalidTracksUrl(InvalidMultipleRemoteRepositories): + """There was an issue with a Tracks metadata url. 
+ + Tracks urls must point to the actual URLs for that project, + point to the repositories that own the namespaces, and + point to a project with the exact same name (after normalization). + """ + + reference = "invalid-tracks-url" + + +class InvalidAlternativeLocationsUrl(InvalidMultipleRemoteRepositories): + """The list of Alternate Locations for each repository do not match. + + In order for this metadata to be trusted, there MUST be agreement between + all locations where that project is found as to what the alternate locations are. + """ + + reference = "invalid-alternative-locations" + + def __init__( + self, + *, + package: str, + remote_repositories: set[str], + invalid_locations: set[str], + ) -> None: + super().__init__( + kind="error", + message=Text( + f"One or more Alternate Locations for {escape(package)} " + "were different among the remote repositories. " + "The remote repositories are " + f"{'; '.join(sorted(escape(r) for r in remote_repositories))}." + " The alternate locations not agreed by all remote " + "repositories are " + f"{'; '.join(sorted(escape(r) for r in invalid_locations))}." + ), + context=Text( + "To be able to trust the remote repository Alternate Locations, " + "all remote repositories must agree on the list of Locations." + ), + hint_stmt=None, + note_stmt=Text( + "The way to resolve this error is to contact the owners of the package " + "at each remote repository, and ask that the list of " + "Alternate Locations at each is set to the same list. " + "See PEP 708 for the specification. " + "You can override this check, which will disable the security " + "protection it provides from dependency confusion attacks, " + "by passing --insecure-multiple-remote-repositories." + ), + ) + + +class UnsafeMultipleRemoteRepositories(InvalidMultipleRemoteRepositories): + """More than one remote repository was provided for a package, + with no indication that the remote repositories can be safely merged. 
+ + The repositories, packages, or user did not indicate that + it is safe to merge remote repositories. + + Multiple remote repositories are not merged by default + to reduce the risk of dependency confusion attacks.""" + + reference = "unsafe-multiple-remote-repositories" + + def __init__(self, *, package: str, remote_repositories: set[str]) -> None: + super().__init__( + kind="error", + message=Text( + f"More than one remote repository was found for {escape(package)}, " + "with no indication that the remote repositories can be safely merged. " + f"The repositories are {'; '.join(sorted(escape(r) for r in remote_repositories))}." + ), + context=Text( + "Multiple remote repositories are not merged by default " + "to reduce the risk of dependency confusion attacks." + ), + hint_stmt=Text( + "Remote repositories can be specified or discovered using " + "--index-url, --extra-index-url, and --find-links. " + "Please check the pip command to see if these are in use." + ), + note_stmt=Text( + "The way to resolve this error is to contact the remote repositories " + "and package owners, and ask if it makes sense to configure them to " + "merge namespaces. " + "See PEP 708 for the specification. " + "You can override this check, which will disable the security " + "protection it provides from dependency confusion attacks, " + "by passing --insecure-multiple-remote-repositories." 
+ ), + ) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 69bb812f5c9..8a31d442450 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -35,10 +35,10 @@ from pip._internal.exceptions import NetworkConnectionError from pip._internal.models.link import ( - Link, - HEAD_META_PREFIX, HEAD_META_ALTERNATE_LOCATIONS, + HEAD_META_PREFIX, HEAD_META_TRACKS, + Link, ) from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession @@ -337,11 +337,11 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]: return None @functools.cached_property - def _meta_key_tracks(self): + def _meta_key_tracks(self) -> str: return f"{HEAD_META_PREFIX}:{HEAD_META_TRACKS}" @functools.cached_property - def _meta_key_alternate_locations(self): + def _meta_key_alternate_locations(self) -> str: return f"{HEAD_META_PREFIX}:{HEAD_META_ALTERNATE_LOCATIONS}" diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 0d65ce35f37..4e77ec395fc 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -17,7 +17,9 @@ from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, + InvalidAlternativeLocationsUrl, InvalidWheelFilename, + UnsafeMultipleRemoteRepositories, UnsupportedWheel, ) from pip._internal.index.collector import LinkCollector, parse_links @@ -475,6 +477,11 @@ def get_applicable_candidates( project_name=self._project_name, ) + check_multiple_remote_repositories( + candidates=filtered_applicable_candidates, + project_name=self._project_name, + ) + return sorted(filtered_applicable_candidates, key=self._sort_key) def _sort_key(self, candidate: InstallationCandidate) -> CandidateSortingKey: @@ -1018,3 +1025,221 @@ def _extract_version_from_fragment(fragment: str, canonical_name: str) -> Option if not version: return 
None return version + + +def check_multiple_remote_repositories( + candidates: List[InstallationCandidate], project_name: str +) -> None: + """ + Check whether two or more different namespaces can be flattened into one. + + This check will raise an error when packages from different sources will be merged, + without clear configuration to indicate the namespace merge is allowed. + See `PEP 708`_. + + This approach allows safely merging separate namespaces that are actually one + logical namespace, while enforcing a more secure default. + + Returns None if checks pass, otherwise will raise an error with details of the + failed checks. + """ + + # NOTE: The checks in this function must occur after: + # Specification: Filter out any files that do not match known hashes from a + # lockfile or requirements file. + + # NOTE: The checks in this function must occur before: + # Specification: Filter out any files that do not match the current platform, + # Python version, etc. We check for the metadata in this PEP before filtering out + # based on platform, Python version, etc., because we don’t want errors that only + # show up on certain platforms, Python versions, etc. + + # NOTE: Implemented in 'filter_unallowed_hashes': + # Specification: Users who are using lock files or requirements files that include + # specific hashes of artifacts that are “valid” are assumed to be protected by + # nature of those hashes, since the rest of these recommendations would apply + # during hash generation. Thus, we filter out unknown hashes up front. + + # NOTE: Implemented in 'collector.py' 'parse_links': + # Specification: When using alternate locations, clients MUST implicitly assume + # that the url the response was fetched from was included in the list. + + # NOTE: Implemented in 'collector.py' 'parse_links': + # Specification: Order of the elements within the array does not have any + # particular meaning. 
+ + # NOTE: Implemented by this function, by not checking all repo tracks metadata is + # the exact same. + # Specification: Mixed use repositories where some names track a repository and + # some names do not are explicitly allowed. + + # TODO: This requirement doesn't look like something that pip can do anything about. + # Specification: All [Tracks metadata] URLs MUST represent the same “project” as + # the project in the extending repository. This does not mean that they need to + # serve the same files. It is valid for them to include binaries built on + # different platforms, copies with local patches being applied, etc. This is + # purposefully left vague as it’s ultimately up to the expectations that the + # users have of the repository and its operators what exactly constitutes + # the “same” project. + + # TODO: This requirement doesn't look like something that pip can do anything about. + # Specification: It [Tracks metadata] is NOT required that every name in a + # repository tracks the same repository, or that they all track a repository at all. + + # TODO: This requirement doesn't look like something that pip can do anything about. + # Specification: It [repository Tracks metadata] MUST be under the control of the + # repository operators themselves, not any individual publisher using that + # repository. “Repository Operator” can also include anyone who managed the overall + # namespace for a particular repository, which may be the case in situations like + # hosted repository services where one entity operates the software but another + # owns/manages the entire namespace of that repository. + + # TODO: Consider making requests to repositories revealed via Alternate Locations + # or Tracks metadata, to assess the metadata of those additional repositories. 
+ # Specification: When an installer encounters a project that is using the + # alternate locations metadata it SHOULD consider that all repositories named are + # extending the same namespace across multiple repositories. + + # TODO: Consider whether pip already has, or should add, ways to indicate exactly + # which individual projects to get from which individual repositories. + # Specification: If the user has explicitly told the installer that it wants to + # fetch a project from a certain set of repositories, then there is no reason + # to question that and we assume that they’ve made sure it is safe to merge + # those namespaces. If the end user has explicitly told the installer to fetch + # the project from specific repositories, filter out all other repositories. + + # When no candidates are provided, then no checks are relevant, so just return. + if candidates is None or len(candidates) == 0: + logger.debug("No candidates given to multiple remote repository checks") + return + + # Pre-calculate the canonical name for later comparisons. + canonical_name = canonicalize_name(project_name) + + # Specification: Look to see if the discovered files span multiple repositories; + # if they do then determine if either “Tracks” or “Alternate Locations” metadata + # allows safely merging ALL the repositories where files were discovered together. 
+ scheme_file = "file://" + remote_candidates = [] + all_remote_urls = set() + all_project_track_urls = [] + all_repo_alt_urls = [] + remote_repositories = set() + for candidate in candidates: + candidate_name = candidate.name + candidate_canonical_name = canonicalize_name(candidate_name) + + file_path = None + try: + file_path = candidate.link.file_path + except Exception: + pass + url = candidate.link.url + + project_track_urls = candidate.link.project_track_urls or set() + all_project_track_urls.append(project_track_urls) + remote_repositories.update(project_track_urls) + + repo_alt_urls = candidate.link.repo_alt_urls or set() + all_repo_alt_urls.append(repo_alt_urls) + remote_repositories.update(repo_alt_urls) + + other_repo_urls = {*project_track_urls, *repo_alt_urls} + has_other_repo_urls = len(other_repo_urls) > 0 + + is_local_url = url and url.startswith(scheme_file) + if url and not is_local_url: + all_remote_urls.add(url) + + # Specification: Repositories that exist on the local filesystem SHOULD always + # be implicitly allowed to be merged to any remote repository. + is_local_file = file_path or is_local_url + has_remote_repo = has_other_repo_urls and any( + not i.startswith(scheme_file) for i in other_repo_urls + ) + if is_local_file and not has_remote_repo: + # Ignore any local candidates in later comparisons. + logger.debug( + "Ignore local candidate %s in multiple remote repository checks", + candidate, + ) + continue + + remote_candidates.append( + { + "name": candidate_name, + "canonical_name": candidate_canonical_name, + "file_path": file_path, + "url": url, + "project_track_urls": project_track_urls, + "repo_alt_urls": repo_alt_urls, + "other_repo_urls": other_repo_urls, + "is_local_file": is_local_file, + "has_remote_repo": has_remote_repo, + } + ) + + # If there are no remote candidates, then allow merging repositories. 
+ if len(remote_candidates) == 0: + logger.debug("No remote candidates for multiple remote repository checks") + return + + # TODO + if logger.isEnabledFor(logging.INFO): + logger.info("Remote candidates for multiple remote repository checks:") + for candidate in remote_candidates: + logger.info(candidate) + + # TODO: This checks the list of Alternate Locations for the candidates that were + # retrieved. It does not request the metadata from any additional repositories + # revealed via the lists of Alternate Locations. + # Specification: In order for this metadata to be trusted, there MUST be agreement + # between all locations where that project is found as to what the alternate + # locations are. + if len(all_repo_alt_urls) > 1: + match_alt_locs = set(all_repo_alt_urls[0]) + invalid_locations = set() + + for item in all_repo_alt_urls[1:]: + match_alt_locs.intersection_update(item) + invalid_locations.update(match_alt_locs.symmetric_difference(item)) + + if len(invalid_locations) > 0: + raise InvalidAlternativeLocationsUrl( + package=project_name, + remote_repositories=remote_repositories, + invalid_locations=invalid_locations, + ) + + # TODO + # Specification: Otherwise [if metadata allows] we merge the namespaces, + # and continue on. + + # TODO + # Specification: It [Tracks metadata] MUST point to the actual URLs for that + # project, not the base URL for the extended repositories. + # raise InvalidTracksUrl + + # TODO + # Specification: It [Tracks metadata] MUST point to the repositories that “own” + # the namespaces, not another repository that is also tracking that namespace. + # raise InvalidTracksUrl + + # TODO + # Specification: It [Tracks metadata] MUST point to a project with the exact same + # name (after normalization). + # raise InvalidTracksUrl + + # TODO + # Specification: If the project in question only comes from a single repository, + # then there is no chance of dependency confusion, so there’s no reason to do + # anything but allow. 
+ + # Specification: If nothing tells us merging the namespaces is safe, we refuse to + # implicitly assume it is, and generate an error instead. + # Specification: If that metadata does NOT allow [merging namespaces], then + # generate an error. + error = UnsafeMultipleRemoteRepositories( + package=project_name, remote_repositories=remote_repositories + ) + raise error diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 7a8bc9efa5e..45c0f138f71 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -599,7 +599,7 @@ def test_parse_links_json() -> None: ), ] - def _convert_links(link_item): + def _convert_links(link_item: Link) -> dict[str, Optional[str]]: return {key: getattr(link_item, key, None) for key in link_item.__slots__} for index, link in enumerate(links): @@ -763,8 +763,8 @@ def test_parse_links_caches_same_page_by_url() -> None: ) def test_parse_links__alternate_locations_and_tracks( index_name: str, - expected_project_track_urls: list[str], - expected_repo_alt_urls: list[str], + expected_project_track_urls: set[str], + expected_repo_alt_urls: set[str], data: TestData, ) -> None: package_name = "simple" diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 78837b94e8b..884f2f8beec 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -901,3 +901,7 @@ def test_extract_version_from_fragment( ) -> None: version = _extract_version_from_fragment(fragment, canonical_name) assert version == expected + + +class TestCheckMultipleRemoteRepositories: + pass From 0c5a3ef74af3aad814d9886f5b92337bab215ecd Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Thu, 19 Sep 2024 18:56:42 +1000 Subject: [PATCH 4/6] work in progress --- src/pip/_internal/exceptions.py | 50 ++++-- src/pip/_internal/index/package_finder.py | 154 +++++++++-------- src/pip/_internal/models/candidate.py | 18 ++ src/pip/_internal/models/link.py | 28 ++++ tests/unit/test_collector.py | 8 +- 
tests/unit/test_index.py                 | 192 +++++++++++++++++++++-
 6 files changed, 363 insertions(+), 87 deletions(-)

diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
index 2dd4c37145b..8e3654baf2a 100644
--- a/src/pip/_internal/exceptions.py
+++ b/src/pip/_internal/exceptions.py
@@ -781,6 +781,12 @@ class InvalidMultipleRemoteRepositories(DiagnosticPipError):
     """Common error for issues with multiple remote repositories."""
 
     reference = "invalid-multiple-remote-repositories"
+    _note_suffix = (
+        "See PEP 708 for the specification. "
+        "You can override this check, which will disable the security "
+        "protection it provides from dependency confusion attacks, "
+        "by passing --insecure-multiple-remote-repositories."
+    )
 
 
 class InvalidTracksUrl(InvalidMultipleRemoteRepositories):
@@ -793,6 +799,36 @@ class InvalidTracksUrl(InvalidMultipleRemoteRepositories):
 
     reference = "invalid-tracks-url"
 
+    def __init__(
+        self,
+        *,
+        package: str,
+        remote_repositories: set[str],
+        invalid_tracks: set[str],
+    ) -> None:
+        super().__init__(
+            kind="error",
+            message=Text(
+                f"One or more Tracks for {escape(package)} "
+                "were not valid. "
+                "The remote repositories are "
+                f"{'; '.join(sorted(escape(r) for r in remote_repositories))}. "
+                "The invalid tracks are "
+                f"{'; '.join(sorted(escape(r) for r in invalid_tracks))}."
+            ),
+            context=Text(
+                "Tracks urls must point to the actual URLs for a project, "
+                "point to the repositories that own the namespaces, and "
+                "point to a project with the exact same normalized name."
+            ),
+            hint_stmt=None,
+            note_stmt=Text(
+                "The way to resolve this error is to contact the owners of "
+                "each remote repository, and ask if it makes sense to "
+                "configure them to merge namespaces. " + self._note_suffix
+            ),
+        )
+
 
 class InvalidAlternativeLocationsUrl(InvalidMultipleRemoteRepositories):
     """The list of Alternate Locations for each repository do not match.
@@ -828,12 +864,8 @@ def __init__( hint_stmt=None, note_stmt=Text( "The way to resolve this error is to contact the owners of the package " - "at each remote repository, and ask that the list of " - "Alternate Locations at each is set to the same list. " - "See PEP 708 for the specification. " - "You can override this check, which will disable the security " - "protection it provides from dependency confusion attacks, " - "by passing --insecure-multiple-remote-repositories." + "at each remote repository, and ask if it makes sense to " + "configure them to merge namespaces. " + self._note_suffix ), ) @@ -870,10 +902,6 @@ def __init__(self, *, package: str, remote_repositories: set[str]) -> None: note_stmt=Text( "The way to resolve this error is to contact the remote repositories " "and package owners, and ask if it makes sense to configure them to " - "merge namespaces. " - "See PEP 708 for the specification. " - "You can override this check, which will disable the security " - "protection it provides from dependency confusion attacks, " - "by passing --insecure-multiple-remote-repositories." + "merge namespaces. 
" + self._note_suffix ), ) diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 4e77ec395fc..75a588e100f 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -4,7 +4,9 @@ import functools import itertools import logging +import pathlib import re +import urllib.parse from dataclasses import dataclass from typing import TYPE_CHECKING, FrozenSet, Iterable, List, Optional, Set, Tuple, Union @@ -19,6 +21,7 @@ DistributionNotFound, InvalidAlternativeLocationsUrl, InvalidWheelFilename, + InvalidTracksUrl, UnsafeMultipleRemoteRepositories, UnsupportedWheel, ) @@ -1099,9 +1102,13 @@ def check_multiple_remote_repositories( # Specification: When an installer encounters a project that is using the # alternate locations metadata it SHOULD consider that all repositories named are # extending the same namespace across multiple repositories. + # Implementation: It is not possible to enforce this requirement without access + # to the metadata for all of the remote repositories, as the Tracks metadata + # might point at a repository that does not exist, or at a repository that is + # Tracking another repository. # TODO: Consider whether pip already has, or should add, ways to indicate exactly - # which individual projects to get from which individual repositories. + # which individual projects to get from exactly which repositories. # Specification: If the user has explicitly told the installer that it wants to # fetch a project from a certain set of repositories, then there is no reason # to question that and we assume that they’ve made sure it is safe to merge @@ -1113,77 +1120,62 @@ def check_multiple_remote_repositories( logger.debug("No candidates given to multiple remote repository checks") return - # Pre-calculate the canonical name for later comparisons. + # Calculate the canonical name for later comparisons. 
canonical_name = canonicalize_name(project_name) # Specification: Look to see if the discovered files span multiple repositories; # if they do then determine if either “Tracks” or “Alternate Locations” metadata - # allows safely merging ALL the repositories where files were discovered together. - scheme_file = "file://" + # allows safely merging together ALL the repositories where files were discovered. remote_candidates = [] - all_remote_urls = set() - all_project_track_urls = [] - all_repo_alt_urls = [] remote_repositories = set() + for candidate in candidates: candidate_name = candidate.name - candidate_canonical_name = canonicalize_name(candidate_name) - - file_path = None - try: - file_path = candidate.link.file_path - except Exception: - pass - url = candidate.link.url - - project_track_urls = candidate.link.project_track_urls or set() - all_project_track_urls.append(project_track_urls) - remote_repositories.update(project_track_urls) - - repo_alt_urls = candidate.link.repo_alt_urls or set() - all_repo_alt_urls.append(repo_alt_urls) - remote_repositories.update(repo_alt_urls) - - other_repo_urls = {*project_track_urls, *repo_alt_urls} - has_other_repo_urls = len(other_repo_urls) > 0 + link = candidate.link + comes_from = link.comes_from - is_local_url = url and url.startswith(scheme_file) - if url and not is_local_url: - all_remote_urls.add(url) + candidate_canonical_name = canonicalize_name(candidate_name) # Specification: Repositories that exist on the local filesystem SHOULD always # be implicitly allowed to be merged to any remote repository. - is_local_file = file_path or is_local_url - has_remote_repo = has_other_repo_urls and any( - not i.startswith(scheme_file) for i in other_repo_urls - ) - if is_local_file and not has_remote_repo: + if link.is_local_only: # Ignore any local candidates in later comparisons. 
logger.debug( - "Ignore local candidate %s in multiple remote repository checks", + "Ignoring local candidate %s in multiple remote repository checks", candidate, ) continue - remote_candidates.append( - { - "name": candidate_name, - "canonical_name": candidate_canonical_name, - "file_path": file_path, - "url": url, - "project_track_urls": project_track_urls, - "repo_alt_urls": repo_alt_urls, - "other_repo_urls": other_repo_urls, - "is_local_file": is_local_file, - "has_remote_repo": has_remote_repo, - } - ) + try: + page_url = comes_from.url.lstrip() + except AttributeError: + page_url = comes_from.lstrip() + + item = { + "candidate": candidate, + "canonical_name": candidate_canonical_name, + } + remote_candidates.append(item) + + remote_repositories.add(page_url) + remote_repositories.update(link.project_track_urls) + remote_repositories.update(link.repo_alt_urls) # If there are no remote candidates, then allow merging repositories. if len(remote_candidates) == 0: logger.debug("No remote candidates for multiple remote repository checks") return + # Specification: If the project in question only comes from a single repository, + # then there is no chance of dependency confusion, so there’s no reason to do + # anything but allow. + if len(remote_repositories) < 2: + logger.debug( + "No chance for dependency confusion when there is only " + "one remote candidate for multiple remote repository checks" + ) + return + # TODO if logger.isEnabledFor(logging.INFO): logger.info("Remote candidates for multiple remote repository checks:") @@ -1192,10 +1184,11 @@ def check_multiple_remote_repositories( # TODO: This checks the list of Alternate Locations for the candidates that were # retrieved. It does not request the metadata from any additional repositories - # revealed via the lists of Alternate Locations. + # revealed via the lists of Alternate Locations urls or Tracks urls. 
# Specification: In order for this metadata to be trusted, there MUST be agreement # between all locations where that project is found as to what the alternate # locations are. + all_repo_alt_urls = list(map_alt_urls.keys()) if len(all_repo_alt_urls) > 1: match_alt_locs = set(all_repo_alt_urls[0]) invalid_locations = set() @@ -1204,6 +1197,9 @@ def check_multiple_remote_repositories( match_alt_locs.intersection_update(item) invalid_locations.update(match_alt_locs.symmetric_difference(item)) + logger.debug(match_alt_locs) + logger.debug(invalid_locations) + if len(invalid_locations) > 0: raise InvalidAlternativeLocationsUrl( package=project_name, @@ -1211,29 +1207,49 @@ def check_multiple_remote_repositories( invalid_locations=invalid_locations, ) - # TODO - # Specification: Otherwise [if metadata allows] we merge the namespaces, - # and continue on. - - # TODO - # Specification: It [Tracks metadata] MUST point to the actual URLs for that - # project, not the base URL for the extended repositories. - # raise InvalidTracksUrl - - # TODO - # Specification: It [Tracks metadata] MUST point to the repositories that “own” - # the namespaces, not another repository that is also tracking that namespace. - # raise InvalidTracksUrl + for remote_candidate in remote_candidates: + project_track_urls = remote_candidate.get("candidate").link.project_track_urls + project_track_urls = remote_candidate.get("candidate").link.project_track_urls + page_url = remote_candidate.get("url") + # url_parts = pathlib.Path(urllib.parse.urlsplit(url).path).parts + for project_track_url in project_track_urls: + parts = pathlib.Path(urllib.parse.urlsplit(project_track_url).path).parts + + # Specification: It [Tracks metadata] MUST point to the actual URLs + # for that project, not the base URL for the extended repositories. + # Specification: It [Tracks metadata] MUST point to a project with + # the exact same normalized name. 
+ # Implementation: The normalised project name must be present in one + # of the Tracks url path parts. + # TODO: This assumption about the structure of the url may not hold true + # for all remote repositories. + if not parts or not any( + [canonical_name == canonicalize_name(p) for p in parts] + ): + raise InvalidTracksUrl( + package=project_name, + remote_repositories={page_url}, + invalid_tracks={project_track_url}, + ) - # TODO - # Specification: It [Tracks metadata] MUST point to a project with the exact same - # name (after normalization). - # raise InvalidTracksUrl + # Specification: It [Tracks metadata] MUST point to the repositories + # that “own” the namespaces, not another repository that is also + # tracking that namespace. + # Implementation: An 'owner' repository is one that does not Track the + # same namespace. + # TODO: Without requesting all repositories revealed by metadata, this + # check might pass with incomplete metadata, + # when it would fail with complete metadata. + if project_track_url in map_track_urls: + raise InvalidTracksUrl( + package=project_name, + remote_repositories={page_url}, + invalid_tracks={project_track_url}, + ) # TODO - # Specification: If the project in question only comes from a single repository, - # then there is no chance of dependency confusion, so there’s no reason to do - # anything but allow. + # Specification: Otherwise [if metadata allows] we merge the namespaces, + # and continue on. # Specification: If nothing tells us merging the namespaces is safe, we refuse to # implicitly assume it is, and generate an error instead. 
diff --git a/src/pip/_internal/models/candidate.py b/src/pip/_internal/models/candidate.py index f27f283154a..5c6dbec6fe5 100644 --- a/src/pip/_internal/models/candidate.py +++ b/src/pip/_internal/models/candidate.py @@ -23,3 +23,21 @@ def __init__(self, name: str, version: str, link: Link) -> None: def __str__(self) -> str: return f"{self.name!r} candidate (version {self.version} at {self.link})" + + +@dataclass(frozen=True) +class RemoteInstallationCandidate: + """Represents a potential "candidate" for installation.""" + + __slots__ = ["canonical_name", "candidate"] + + canonical_name: str + candidate: InstallationCandidate + + def __init__(self, candidate: InstallationCandidate) -> None: + object.__setattr__(self, "name", name) + object.__setattr__(self, "version", parse_version(version)) + object.__setattr__(self, "link", link) + + def __str__(self) -> str: + return f"{self.name!r} candidate (version {self.version} at {self.link})" diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2bdbb827029..78fcbae778f 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -318,6 +318,9 @@ def from_json( elif not yanked_reason: yanked_reason = None + project_track_urls = {i.strip() for i in project_track_urls if i and i.strip()} + repo_alt_urls = {i.strip() for i in repo_alt_urls if i and i.strip()} + return cls( url, comes_from=page_url, @@ -375,6 +378,9 @@ def from_element( ) metadata_file_data = MetadataFile(None) + project_track_urls = {i.strip() for i in project_track_urls if i and i.strip()} + repo_alt_urls = {i.strip() for i in repo_alt_urls if i and i.strip()} + return cls( url, comes_from=page_url, @@ -551,6 +557,28 @@ def is_hash_allowed(self, hashes: Optional[Hashes]) -> bool: return False return any(hashes.is_hash_allowed(k, v) for k, v in self._hashes.items()) + @property + def is_local_only(self): + """ + Is this link entirely local, with no metadata pointing to a remote url? 
+ """ + logger.debug( + { + "is_file": self.is_file, + "file_path": self.file_path if self.is_file else None, + "comes_from": self.comes_from, + "metadata_urls": {*self.project_track_urls, *self.repo_alt_urls}, + } + ) + return ( + (self.is_file and self.file_path) + and not self.comes_from + and not any( + not i.startswith("file:") + for i in {*self.project_track_urls, *self.repo_alt_urls} + ) + ) + class _CleanResult(NamedTuple): """Convert link for equivalency check. diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 45c0f138f71..40ee2fc274f 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -537,13 +537,13 @@ def test_parse_links_json() -> None: ], } ).encode("utf8") + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. url = f"https://example.com/simple-{uuid.uuid4()}/" page = IndexContent( json_bytes, "application/vnd.pypi.simple.v1+json", encoding=None, - # parse_links() is cached by url, so we inject a random uuid to ensure - # the page content isn't cached. url=url, ) links = list(parse_links(page)) @@ -771,13 +771,13 @@ def test_parse_links__alternate_locations_and_tracks( html_path = data.indexes / index_name / package_name / "index.html" html = html_path.read_text() html_bytes = html.encode("utf-8") + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. url = f"https://example.com/simple-{uuid.uuid4()}/" page = IndexContent( html_bytes, "text/html", encoding=None, - # parse_links() is cached by url, so we inject a random uuid to ensure - # the page content isn't cached. 
url=url, ) links = list(parse_links(page)) diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index 884f2f8beec..df9ffbe093a 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -1,11 +1,19 @@ import logging +import uuid from typing import FrozenSet, List, Optional, Set, Tuple import pytest +from pip._internal.models.candidate import InstallationCandidate from pip._vendor.packaging.specifiers import SpecifierSet from pip._vendor.packaging.tags import Tag -from pip._internal.index.collector import LinkCollector +from pip._internal.index.collector import LinkCollector, IndexContent +from pip._internal.exceptions import ( + InvalidAlternativeLocationsUrl, + InvalidMultipleRemoteRepositories, + InvalidTracksUrl, + UnsafeMultipleRemoteRepositories, +) from pip._internal.index.package_finder import ( CandidateEvaluator, CandidatePreferences, @@ -16,6 +24,7 @@ _check_link_requires_python, _extract_version_from_fragment, _find_name_version_sep, + check_multiple_remote_repositories, filter_unallowed_hashes, ) from pip._internal.models.link import Link @@ -903,5 +912,182 @@ def test_extract_version_from_fragment( assert version == expected -class TestCheckMultipleRemoteRepositories: - pass +def _make_mock_candidate_check_remote_repo( + candidate_name: Optional[str] = None, + version: Optional[str] = None, + comes_from_url: Optional[str] = None, + project_track_urls: Optional[Set[str]] = None, + repo_alt_urls: Optional[Set[str]] = None, +) -> InstallationCandidate: + if candidate_name is None: + candidate_name = "mypackage" + + if version is None: + version = "1.0" + + comes_from = None + if comes_from_url is not None: + comes_from = IndexContent( + b"", + "text/html", + encoding=None, + url=comes_from_url, + ) + + url = f"https://example.com/pkg-{version}.tar.gz" + + link = Link( + url, + comes_from=comes_from, + project_track_urls=project_track_urls, + repo_alt_urls=repo_alt_urls, + ) + candidate = InstallationCandidate(candidate_name, 
version, link) + + return candidate + + +@pytest.mark.parametrize( + "candidates, project_name, expected", + [ + # checks pass when no candidates + ([], "my_package", None), + # checks pass when only one candidate + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + ) + ], + "mypackage", + None, + ), + # checks fail when two candidates with different remotes + # and no metadata to enable merging namespaces + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + ), + ], + "mypackage", + UnsafeMultipleRemoteRepositories, + ), + # checks pass when only one candidate with tracks url + # TODO: not making requests to repos revealed via metadata + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + project_track_urls={"https://a.example.com/simple/mypackage"}, + ), + ], + "mypackage", + None, + ), + # checks pass when ony one candidate with alt loc url + # TODO: not making requests to repos revealed via metadata + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + repo_alt_urls={"https://a.example.com/simple/mypackage"}, + ), + ], + "mypackage", + None, + ), + # checks fail when alternate location urls do not agree + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + repo_alt_urls={"https://b.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + 
repo_alt_urls={"https://c.example.com/simple/mypackage"}, + ), + ], + "mypackage", + InvalidAlternativeLocationsUrl, + ), + # checks fails when track url points to the base url instead of + # the project url + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com"}, + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks fail when track url points to another tracker instead of + # the 'owner' project url + ( + [ + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + project_track_urls={"https://c.example.com/simple/mypackage"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://c.example.com/simple/mypackage", + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks fail when track url points to different name instead of + # the project url with same name + ( + [ + _make_mock_candidate_check_remote_repo( + candidate_name="othername", + comes_from_url=f"https://a.example.com/simple/othername", + project_track_urls={"https://b.example.com/simple/othername"}, + ), + _make_mock_candidate_check_remote_repo( + comes_from_url=f"https://b.example.com/simple/mypackage", + ), + ], + "mypackage", + InvalidTracksUrl, + ), + # checks pass when track url points to same normalized name + ( + [ + _make_mock_candidate_check_remote_repo( + candidate_name="a.b-c_d", + comes_from_url=f"https://a.example.com/simple/mypackage", + project_track_urls={"https://b.example.com/simple/a_b.c-d"}, + ), + ], + "a-b_c.d", + None, + ), + ], +) +def test_check_multiple_remote_repositories( + caplog, candidates: List[InstallationCandidate], project_name: str, expected +): + caplog.set_level(logging.DEBUG) + if expected: + with pytest.raises(expected): + 
check_multiple_remote_repositories(candidates, project_name) + else: + assert check_multiple_remote_repositories(candidates, project_name) is None From 5f1c5c64d8c4c638015551e5583a02b1909f2f52 Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Thu, 19 Sep 2024 20:02:58 +1000 Subject: [PATCH 5/6] fix lints --- src/pip/_internal/exceptions.py | 15 ++++++----- src/pip/_internal/index/package_finder.py | 2 +- src/pip/_internal/models/link.py | 4 +-- tests/unit/test_collector.py | 6 ++--- tests/unit/test_index.py | 32 +++++++++++------------ 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 60f5cff78b8..fd8f594b2b5 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -13,7 +13,7 @@ import re import sys from itertools import chain, groupby, repeat -from typing import TYPE_CHECKING, Dict, Iterator, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Dict, Iterator, List, Literal, Optional, Set, Union from pip._vendor.rich.console import Console, ConsoleOptions, RenderResult from pip._vendor.rich.markup import escape @@ -803,8 +803,8 @@ def __init__( self, *, package: str, - remote_repositories: set[str], - invalid_tracks: set[str], + remote_repositories: Set[str], + invalid_tracks: Set[str], ) -> None: super().__init__( kind="error", @@ -843,8 +843,8 @@ def __init__( self, *, package: str, - remote_repositories: set[str], - invalid_locations: set[str], + remote_repositories: Set[str], + invalid_locations: Set[str], ) -> None: super().__init__( kind="error", @@ -882,13 +882,14 @@ class UnsafeMultipleRemoteRepositories(InvalidMultipleRemoteRepositories): reference = "unsafe-multiple-remote-repositories" - def __init__(self, *, package: str, remote_repositories: set[str]) -> None: + def __init__(self, *, package: str, remote_repositories: Set[str]) -> None: super().__init__( kind="error", message=Text( f"More than one remote repository 
was found for {escape(package)}, " "with no indication that the remote repositories can be safely merged. " - f"The repositories are {'; '.join(sorted(escape(r) for r in remote_repositories))}." + "The repositories are " + f"{'; '.join(sorted(escape(r) for r in remote_repositories))}." ), context=Text( "Multiple remote repositories are not merged by default " diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 75a588e100f..e4d2ce8b39a 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -1224,7 +1224,7 @@ def check_multiple_remote_repositories( # TODO: This assumption about the structure of the url may not hold true # for all remote repositories. if not parts or not any( - [canonical_name == canonicalize_name(p) for p in parts] + canonical_name == canonicalize_name(p) for p in parts ): raise InvalidTracksUrl( package=project_name, diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 78fcbae778f..309ab87cd1f 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -279,8 +279,8 @@ def from_json( cls, file_data: Dict[str, Any], page_url: str, - project_track_urls: Optional[set[str]] = None, - repo_alt_urls: Optional[set[str]] = None, + project_track_urls: Optional[Set[str]] = None, + repo_alt_urls: Optional[Set[str]] = None, ) -> Optional["Link"]: """ Convert an pypi json document from a simple repository page into a Link. 
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
index c5d3658fa40..1c7d1ceae42 100644
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@@ -6,7 +6,7 @@
 import uuid
 from pathlib import Path
 from textwrap import dedent
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 from unittest import mock
 
 import pytest
@@ -763,8 +763,8 @@ def test_parse_links_caches_same_page_by_url() -> None:
 )
 def test_parse_links__alternate_locations_and_tracks(
     index_name: str,
-    expected_project_track_urls: set[str],
-    expected_repo_alt_urls: set[str],
+    expected_project_track_urls: Set[str],
+    expected_repo_alt_urls: Set[str],
     data: TestData,
 ) -> None:
     package_name = "simple"
diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py
index 4a6719ed1b9..ba9f8b21ac0 100644
--- a/tests/unit/test_index.py
+++ b/tests/unit/test_index.py
@@ -956,7 +956,7 @@ def _make_mock_candidate_check_remote_repo(
     (
         [
             _make_mock_candidate_check_remote_repo(
-                comes_from_url=f"https://a.example.com/simple/mypackage",
+                comes_from_url="https://a.example.com/simple/mypackage",
             )
         ],
         "mypackage",
@@ -967,10 +967,10 @@ def _make_mock_candidate_check_remote_repo(
     (
         [
             _make_mock_candidate_check_remote_repo(
-                comes_from_url=f"https://a.example.com/simple/mypackage",
+                comes_from_url="https://a.example.com/simple/mypackage",
             ),
             _make_mock_candidate_check_remote_repo(
-                comes_from_url=f"https://b.example.com/simple/mypackage",
+                comes_from_url="https://b.example.com/simple/mypackage",
project_track_urls={"https://a.example.com/simple/mypackage"}, ), ], @@ -996,10 +996,10 @@ def _make_mock_candidate_check_remote_repo( ( [ _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://a.example.com/simple/mypackage", + comes_from_url="https://a.example.com/simple/mypackage", ), _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://b.example.com/simple/mypackage", + comes_from_url="https://b.example.com/simple/mypackage", repo_alt_urls={"https://a.example.com/simple/mypackage"}, ), ], @@ -1010,11 +1010,11 @@ def _make_mock_candidate_check_remote_repo( ( [ _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://a.example.com/simple/mypackage", + comes_from_url="https://a.example.com/simple/mypackage", repo_alt_urls={"https://b.example.com/simple/mypackage"}, ), _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://b.example.com/simple/mypackage", + comes_from_url="https://b.example.com/simple/mypackage", repo_alt_urls={"https://c.example.com/simple/mypackage"}, ), ], @@ -1026,7 +1026,7 @@ def _make_mock_candidate_check_remote_repo( ( [ _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://a.example.com/simple/mypackage", + comes_from_url="https://a.example.com/simple/mypackage", project_track_urls={"https://b.example.com"}, ), ], @@ -1038,15 +1038,15 @@ def _make_mock_candidate_check_remote_repo( ( [ _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://a.example.com/simple/mypackage", + comes_from_url="https://a.example.com/simple/mypackage", project_track_urls={"https://b.example.com/simple/mypackage"}, ), _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://b.example.com/simple/mypackage", + comes_from_url="https://b.example.com/simple/mypackage", project_track_urls={"https://c.example.com/simple/mypackage"}, ), _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://c.example.com/simple/mypackage", + 
comes_from_url="https://c.example.com/simple/mypackage", ), ], "mypackage", @@ -1058,11 +1058,11 @@ def _make_mock_candidate_check_remote_repo( [ _make_mock_candidate_check_remote_repo( candidate_name="othername", - comes_from_url=f"https://a.example.com/simple/othername", + comes_from_url="https://a.example.com/simple/othername", project_track_urls={"https://b.example.com/simple/othername"}, ), _make_mock_candidate_check_remote_repo( - comes_from_url=f"https://b.example.com/simple/mypackage", + comes_from_url="https://b.example.com/simple/mypackage", ), ], "mypackage", @@ -1073,7 +1073,7 @@ def _make_mock_candidate_check_remote_repo( [ _make_mock_candidate_check_remote_repo( candidate_name="a.b-c_d", - comes_from_url=f"https://a.example.com/simple/mypackage", + comes_from_url="https://a.example.com/simple/mypackage", project_track_urls={"https://b.example.com/simple/a_b.c-d"}, ), ], From 1f77352a7f0b7a996611ffd470dac78ee3a4457a Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Sat, 28 Sep 2024 19:00:05 +1000 Subject: [PATCH 6/6] work in progress --- src/pip/_internal/exceptions.py | 2 +- src/pip/_internal/index/package_finder.py | 181 +++++++++++++--------- src/pip/_internal/models/candidate.py | 56 ++++++- src/pip/_internal/models/link.py | 1 - 4 files changed, 160 insertions(+), 80 deletions(-) diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index fd8f594b2b5..8aac5da072e 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -853,7 +853,7 @@ def __init__( "were different among the remote repositories. " "The remote repositories are " f"{'; '.join(sorted(escape(r) for r in remote_repositories))}." - "The alternate locations not not agreed by all remote " + "The alternate locations not agreed by all remote " "repository are " f"{'; '.join(sorted(escape(r) for r in invalid_locations))}." 
), diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index e4d2ce8b39a..fb280453cab 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -26,7 +26,8 @@ UnsupportedWheel, ) from pip._internal.index.collector import LinkCollector, parse_links -from pip._internal.models.candidate import InstallationCandidate +from pip._internal.models.candidate import InstallationCandidate, \ + RemoteInstallationCandidate from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link from pip._internal.models.search_scope import SearchScope @@ -1044,7 +1045,7 @@ def check_multiple_remote_repositories( logical namespace, while enforcing a more secure default. Returns None if checks pass, otherwise will raise an error with details of the - failed checks. + first failed check. """ # NOTE: The checks in this function must occur after: @@ -1077,6 +1078,7 @@ def check_multiple_remote_repositories( # some names do not are explicitly allowed. # TODO: This requirement doesn't look like something that pip can do anything about. + # How would pip be able to tell whether two urls are the 'same project'? # Specification: All [Tracks metadata] URLs MUST represent the same “project” as # the project in the extending repository. This does not mean that they need to # serve the same files. It is valid for them to include binaries built on @@ -1086,10 +1088,14 @@ def check_multiple_remote_repositories( # the “same” project. # TODO: This requirement doesn't look like something that pip can do anything about. + # Tracks metadata is not required, so anything that uses tracks meta + # must deal with it not being available. # Specification: It [Tracks metadata] is NOT required that every name in a # repository tracks the same repository, or that they all track a repository at all. # TODO: This requirement doesn't look like something that pip can do anything about. 
+ # How would pip be able to tell whether the metadata is under the control of + # the repository operators and not any individual publisher using the repository? # Specification: It [repository Tracks metadata] MUST be under the control of the # repository operators themselves, not any individual publisher using that # repository. “Repository Operator” can also include anyone who managed the overall @@ -1098,7 +1104,9 @@ def check_multiple_remote_repositories( # owns/manages the entire namespace of that repository. # TODO: Consider making requests to repositories revealed via Alternate Locations - # or Tracks metadata, to assess the metadata of those additional repositories. + # or Tracks metadata, to assess the metadata of those additional repositories. + # This would be a fair bit more functionality to include. + # Maybe a subsequent PR, if this functionality is desirable? # Specification: When an installer encounters a project that is using the # alternate locations metadata it SHOULD consider that all repositories named are # extending the same namespace across multiple repositories. @@ -1109,6 +1117,7 @@ def check_multiple_remote_repositories( # TODO: Consider whether pip already has, or should add, ways to indicate exactly # which individual projects to get from exactly which repositories. + # This seems to be separate functionality, that could be added in a separate PR. # Specification: If the user has explicitly told the installer that it wants to # fetch a project from a certain set of repositories, then there is no reason # to question that and we assume that they’ve made sure it is safe to merge @@ -1118,7 +1127,7 @@ def check_multiple_remote_repositories( # When no candidates are provided, then no checks are relevant, so just return. if candidates is None or len(candidates) == 0: logger.debug("No candidates given to multiple remote repository checks") - return + return None # Calculate the canonical name for later comparisons. 
canonical_name = canonicalize_name(project_name) @@ -1127,14 +1136,19 @@ def check_multiple_remote_repositories( # if they do then determine if either “Tracks” or “Alternate Locations” metadata # allows safely merging together ALL the repositories where files were discovered. remote_candidates = [] - remote_repositories = set() - + # all known remote repositories + known_remote_repo_urls = set() + # all known alternate location urls + known_alternate_urls = set() + # the alternate location urls that are not present in all remote candidates + mismatch_alternate_urls = set() + # all known remote repositories that do not track any other repos + known_owner_repo_urls = set() + # all known remote repositories that do track other repos + known_tracker_repo_urls = set() for candidate in candidates: candidate_name = candidate.name link = candidate.link - comes_from = link.comes_from - - candidate_canonical_name = canonicalize_name(candidate_name) # Specification: Repositories that exist on the local filesystem SHOULD always # be implicitly allowed to be merged to any remote repository. 
@@ -1146,83 +1160,88 @@ def check_multiple_remote_repositories( ) continue - try: - page_url = comes_from.url.lstrip() - except AttributeError: - page_url = comes_from.lstrip() + remote_candidate = RemoteInstallationCandidate( + canonical_name=canonicalize_name(candidate_name), + candidate=candidate, + ) + remote_candidates.append(remote_candidate) - item = { - "candidate": candidate, - "canonical_name": candidate_canonical_name, - } - remote_candidates.append(item) + # populate the known urls + known_remote_repo_urls.update(remote_candidate.remote_repository_urls) + known_alternate_urls.update(remote_candidate.alternate_location_urls) + if len(remote_candidate.project_track_urls) > 0: + known_tracker_repo_urls.update(remote_candidate.url) + else: + known_owner_repo_urls.update(remote_candidate.url) - remote_repositories.add(page_url) - remote_repositories.update(link.project_track_urls) - remote_repositories.update(link.repo_alt_urls) + # Update the set, keeping only elements found in either set, but not in both. + # This should allow all the items that are not present for all remote candidates + # to be tracked. + mismatch_alternate_urls.symmetric_difference_update( + remote_candidate.alternate_location_urls + ) # If there are no remote candidates, then allow merging repositories. if len(remote_candidates) == 0: logger.debug("No remote candidates for multiple remote repository checks") - return + return None # Specification: If the project in question only comes from a single repository, # then there is no chance of dependency confusion, so there’s no reason to do # anything but allow. 
- if len(remote_repositories) < 2: + if len(known_remote_repo_urls) == 1: logger.debug( "No chance for dependency confusion when there is only " "one remote candidate for multiple remote repository checks" ) - return - - # TODO - if logger.isEnabledFor(logging.INFO): - logger.info("Remote candidates for multiple remote repository checks:") - for candidate in remote_candidates: - logger.info(candidate) - - # TODO: This checks the list of Alternate Locations for the candidates that were - # retrieved. It does not request the metadata from any additional repositories - # revealed via the lists of Alternate Locations urls or Tracks urls. + return None + if len(known_remote_repo_urls) == 0: + msg = ("Unexpected situation where there are remote candidates, " + "but no remote repositories") + logger.warning(msg) + raise ValueError(msg) + + # This checks the list of Alternate Locations for the candidates that were + # retrieved. Each remote candidate may define a different list of Tracks and + # Alternate Location metadata. + # TODO: The Alternate Locations and Tracks urls revealed by the candidate metadata + # are not requested to check that their metadata matches the candidate metadata. + # This means that the known candidate metadata might agree and pass this check, + # while retrieving the metadata from additional urls would not agree and would + # fail this check. # Specification: In order for this metadata to be trusted, there MUST be agreement # between all locations where that project is found as to what the alternate # locations are. 
- all_repo_alt_urls = list(map_alt_urls.keys()) - if len(all_repo_alt_urls) > 1: - match_alt_locs = set(all_repo_alt_urls[0]) - invalid_locations = set() - - for item in all_repo_alt_urls[1:]: - match_alt_locs.intersection_update(item) - invalid_locations.update(match_alt_locs.symmetric_difference(item)) - - logger.debug(match_alt_locs) - logger.debug(invalid_locations) - - if len(invalid_locations) > 0: - raise InvalidAlternativeLocationsUrl( - package=project_name, - remote_repositories=remote_repositories, - invalid_locations=invalid_locations, - ) + if len(mismatch_alternate_urls) > 0: + raise InvalidAlternativeLocationsUrl( + package=project_name, + remote_repositories=known_remote_repo_urls, + invalid_locations=mismatch_alternate_urls, + ) + # Check the Tracks metadata. for remote_candidate in remote_candidates: - project_track_urls = remote_candidate.get("candidate").link.project_track_urls - project_track_urls = remote_candidate.get("candidate").link.project_track_urls - page_url = remote_candidate.get("url") - # url_parts = pathlib.Path(urllib.parse.urlsplit(url).path).parts + project_track_urls = remote_candidate.project_track_urls + page_url = remote_candidate.url for project_track_url in project_track_urls: parts = pathlib.Path(urllib.parse.urlsplit(project_track_url).path).parts + parts = list(parts) + parts.reverse() # Specification: It [Tracks metadata] MUST point to the actual URLs # for that project, not the base URL for the extended repositories. + # Implementation Note: Assume that the actual url for the project will + # contain the canonical project name as a path part, and the base URL + # will not contain the canonical project name as a path part. + # Specification: It [Tracks metadata] MUST point to a project with # the exact same normalized name. - # Implementation: The normalised project name must be present in one - # of the Tracks url path parts. 
- # TODO: This assumption about the structure of the url may not hold true - # for all remote repositories. + # Implementation Note: Assume that projects that are configured to be the + # 'exact same' will have a Tracks url path part with the canonical + # project name, usually the last path part. + + # Note: These assumptions about the structure of the Tracks url may not + # hold true for all remote repositories. if not parts or not any( canonical_name == canonicalize_name(p) for p in parts ): @@ -1235,27 +1254,43 @@ def check_multiple_remote_repositories( # Specification: It [Tracks metadata] MUST point to the repositories # that “own” the namespaces, not another repository that is also # tracking that namespace. - # Implementation: An 'owner' repository is one that does not Track the - # same namespace. + # Implementation Note: An 'owner' repository is one that has no Tracks + # metadata. # TODO: Without requesting all repositories revealed by metadata, this - # check might pass with incomplete metadata, - # when it would fail with complete metadata. - if project_track_url in map_track_urls: + # check might pass with incomplete knowledge of all metadata, + # when it would fail after retrieving all metadata. + if project_track_url not in known_owner_repo_urls: raise InvalidTracksUrl( package=project_name, remote_repositories={page_url}, invalid_tracks={project_track_url}, ) - # TODO - # Specification: Otherwise [if metadata allows] we merge the namespaces, - # and continue on. - # Specification: If nothing tells us merging the namespaces is safe, we refuse to # implicitly assume it is, and generate an error instead. # Specification: If that metadata does NOT allow [merging namespaces], then # generate an error. 
- error = UnsafeMultipleRemoteRepositories( - package=project_name, remote_repositories=remote_repositories - ) - raise error + # Implementation Note: If there are two or more remote candidates, and any of them + # don't have valid Alternate Locations and/or Tracks metadata, then fail. + for remote_candidate in remote_candidates: + candidate_alt_urls = remote_candidate.alternate_location_urls + + invalid_alt_urls = known_alternate_urls - candidate_alt_urls + has_alts = len(candidate_alt_urls) > 0 + has_tracks = len(remote_candidate.project_track_urls) > 0 + + is_invalid = any([ + not has_alts and not has_tracks, + not has_tracks and invalid_alt_urls, + ]) + + if is_invalid: + error = UnsafeMultipleRemoteRepositories( + package=project_name, + remote_repositories=known_remote_repo_urls, + ) + raise error + + # Specification: Otherwise [if metadata allows] we merge the namespaces, + # and continue on. + return None diff --git a/src/pip/_internal/models/candidate.py b/src/pip/_internal/models/candidate.py index 5c6dbec6fe5..f53b05f20f7 100644 --- a/src/pip/_internal/models/candidate.py +++ b/src/pip/_internal/models/candidate.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from typing import Set, Optional from pip._vendor.packaging.version import Version from pip._vendor.packaging.version import parse as parse_version @@ -34,10 +35,55 @@ class RemoteInstallationCandidate: canonical_name: str candidate: InstallationCandidate - def __init__(self, candidate: InstallationCandidate) -> None: - object.__setattr__(self, "name", name) - object.__setattr__(self, "version", parse_version(version)) - object.__setattr__(self, "link", link) + def __init__(self, canonical_name:str, candidate: InstallationCandidate) -> None: + object.__setattr__(self, "canonical_name", canonical_name) + object.__setattr__(self, "candidate", candidate) def __str__(self) -> str: - return f"{self.name!r} candidate (version {self.version} at {self.link})" + return f"{self.canonical_name!r} 
candidate" + + @property + def _link(self) -> Optional[Link]: + if not self.candidate: + return None + if not self.candidate.link: + return None + return self.candidate.link + + @property + def url(self) -> Optional[str]: + """The remote url that contains the metadata for this installation candidate.""" + link = self._link + if not link: + return None + if not link.comes_from: + return None + if hasattr(link.comes_from, 'url') and link.comes_from.url: + return self.candidate.link.comes_from.url.lstrip() + if link.comes_from: + return self.candidate.link.comes_from.lstrip() + return None + + + @property + def remote_repository_urls(self) -> Set[str]: + """Remote repository urls from Tracks and Alternate Locations metadata.""" + return {*self.project_track_urls, *self.alternate_location_urls} + + @property + def project_track_urls(self) -> Set[str]: + """Remote repository urls from Tracks metadata.""" + result = {} + link = self._link + if link and link.project_track_urls: + result.update(link.project_track_urls) + return {i for i in result if i} + + @property + def alternate_location_urls(self) -> Set[str]: + """Remote repository urls from Alternate Locations metadata.""" + result = {self.url} + link = self._link + if link and link.repo_alt_urls: + result.update(link.repo_alt_urls) + return {i for i in result if i} diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 309ab87cd1f..9f008a27c7b 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -270,7 +270,6 @@ def __init__( self.cache_link_parsing = cache_link_parsing self.egg_fragment = self._egg_fragment() - # PEP 708 self.project_track_urls = project_track_urls or set() self.repo_alt_urls = repo_alt_urls or set()