From c4a1fc80bd070f4f663b21796548568ed08af41b Mon Sep 17 00:00:00 2001 From: Alyssa Coghlan Date: Wed, 27 Nov 2024 21:54:26 +1000 Subject: [PATCH] Actually test gzip archive generation --- src/venvstacks/pack_venv.py | 37 ++++++++++++++--------- src/venvstacks/stacks.py | 18 ++++++----- tests/test_cli_invocation.py | 30 +++++++++++++++++++ tests/test_minimal_project.py | 56 +++++++++++++++++++++++++++++++++-- 4 files changed, 116 insertions(+), 25 deletions(-) diff --git a/src/venvstacks/pack_venv.py b/src/venvstacks/pack_venv.py index 4d255a4..79e872c 100755 --- a/src/venvstacks/pack_venv.py +++ b/src/venvstacks/pack_venv.py @@ -39,10 +39,12 @@ import tempfile import time +from contextlib import ExitStack from datetime import datetime, timedelta, timezone, tzinfo from enum import StrEnum +from gzip import GzipFile from pathlib import Path -from typing import Any, Callable, cast, Self, TextIO +from typing import Any, Callable, cast, BinaryIO, Self, TextIO from ._injected import postinstall as _default_postinstall from ._util import as_normalized_path, StrPath, WINDOWS_BUILD as _WINDOWS_BUILD @@ -161,7 +163,7 @@ def make_archive( base_dir, max_mtime, progress_callback, - compress=str(self), + compress=self.get_compression(), ) # Not a tar compression format -> emit a zipfile instead return _make_zipfile( @@ -330,10 +332,10 @@ def report_progress(_: Any) -> None: # between the number of paths found by `rglob` and the number of archive entries progress_bar.show(1.0) # The name query and the archive creation should always report the same archive name - assert archive_with_extension == os.fspath( - archive_format.get_archive_path(archive_base_name) - ) - return Path(archive_with_extension) + created_path = Path(archive_with_extension) + expected_path = archive_format.get_archive_path(archive_base_name) + assert created_path == expected_path, f"{created_path} != {expected_path}" + return created_path # Would prefer to use shutil.make_archive, but the way it works doesn't quite fit this case @@ -355,7 +357,7 @@ def _make_tar_archive( ) -> str: """Create a (possibly compressed) tar file from all the files under 'base_dir'. - 'compress' must be "gzip", "bzip2", "xz", or None. + 'compress' must be "gzip", "bzip2", "xz", the empty string, or None. Owner and group info is always set to 0/"root" as per https://reproducible-builds.org/docs/archives/. 
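The gzip-specific handling in the next hunk exists because the gzip container itself embeds a modification timestamp (the MTIME field from RFC 1952), so even a byte-identical tar stream produces a different .tar.gz file on every run unless that field is pinned. A minimal standalone sketch of the underlying stdlib behaviour (illustrative only, not part of this patch):

    import gzip

    payload = b"identical input data"

    # Compressing the same data at different times normally yields different
    # bytes, because the current time is written into the gzip header.
    # Pinning mtime=0 zeroes the MTIME field and makes the output reproducible:
    stable_1 = gzip.compress(payload, mtime=0)
    stable_2 = gzip.compress(payload, mtime=0)
    assert stable_1 == stable_2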
@@ -431,15 +433,22 @@ def _process_archive_entry(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
         return tarinfo
 
     # creating the tarball
-    tar = tarfile.open(archive_name, tar_mode)
-    arcname = base_dir
-    if root_dir is not None:
-        base_dir = os.path.join(root_dir, base_dir)
-    try:
+    with ExitStack() as stack:
+        if _clamp_mtime is not None and compress == "gzip":
+            # Zero out the timestamp in the gzip header; this wrapper then applies the compression
+            storage = cast(BinaryIO, GzipFile(archive_name, mode="w", mtime=0))
+            stack.enter_context(storage)
+        else:
+            # Either mtime is not being clamped, or the compression format has no header timestamp
+            storage = None
+
+        tar = tarfile.open(archive_name, "w" if storage else tar_mode, fileobj=storage)
+        stack.enter_context(tar)
+        arcname = base_dir
+        if root_dir is not None:
+            base_dir = os.path.join(root_dir, base_dir)
         # In Python 3.7+, tar.add inherently adds entries in sorted order
         tar.add(base_dir, arcname, filter=_process_archive_entry)
-    finally:
-        tar.close()
 
     if root_dir is not None:
         archive_name = os.path.abspath(archive_name)
diff --git a/src/venvstacks/stacks.py b/src/venvstacks/stacks.py
index 368202d..87476c2 100755
--- a/src/venvstacks/stacks.py
+++ b/src/venvstacks/stacks.py
@@ -708,29 +708,31 @@ def create_archive(
         )
         build_metadata = self.build_metadata
         archive_base_path = self.archive_base_path
-        built_archive_path = archive_base_path.parent / build_metadata["archive_name"]
+        archive_path = archive_base_path.parent / build_metadata["archive_name"]
         if not self.needs_build:
             # Already built archive looks OK, so just return the same metadata as last build
-            print(f"Using previously built archive at {str(built_archive_path)!r}")
+            print(f"Using previously built archive at {str(archive_path)!r}")
             previous_metadata = self.archive_metadata
             assert previous_metadata is not None
-            return previous_metadata, built_archive_path
-        if built_archive_path.exists():
-            print(f"Removing outdated archive at {str(built_archive_path)!r}")
-            built_archive_path.unlink()
+            return previous_metadata, archive_path
+        if archive_path.exists():
+            print(f"Removing outdated archive at {str(archive_path)!r}")
+            archive_path.unlink()
         print(f"Creating archive for {str(env_path)!r}")
         last_locked = self.env_lock.last_locked
         if work_path is None:
             # /tmp is likely too small for ML/AI environments
             work_path = self.env_path.parent
-        archive_path = pack_venv.create_archive(
+        built_path = pack_venv.create_archive(
             env_path,
             archive_base_path,
             clamp_mtime=last_locked,
             work_dir=work_path,
             install_target=build_metadata["install_target"],
+            archive_format=self.archive_format,
         )
-        assert built_archive_path == archive_path  # pack_venv ensures this is true
+        # This assertion can be helpful when debugging archive creation changes
+        assert built_path == archive_path, f"{built_path} != {archive_path}"
         print(f"Created {str(archive_path)!r} from {str(env_path)!r}")
         metadata = ArchiveMetadata(
diff --git a/tests/test_cli_invocation.py b/tests/test_cli_invocation.py
index 25cfa4a..b961aae 100644
--- a/tests/test_cli_invocation.py
+++ b/tests/test_cli_invocation.py
@@ -474,6 +474,22 @@ def test_index_options(
             dry_run=True, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         ("--lock", "--build", "--publish", "--clean", "--tag-outputs"),
         dict(lock=True, build=True, publish=True),  # select_operations
@@ -643,6 +659,20 @@ def test_mock_lock_usage_error(
             force=False, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         (
             "--no-force",
diff --git a/tests/test_minimal_project.py b/tests/test_minimal_project.py
index f986f5c..406404b 100644
--- a/tests/test_minimal_project.py
+++ b/tests/test_minimal_project.py
@@ -147,7 +147,7 @@ def _filter_manifest(
     filtered_summaries: ArchiveSummaries = {}
     last_locked_times: LastLockedTimes = {}
     for kind, archive_manifests in manifest["layers"].items():
-        filtered_summaries[kind] = summaries = []
+        filtered_summaries[str(kind)] = summaries = []
        for archive_manifest in archive_manifests:
             summaries.append(_filter_archive_manifest(archive_manifest))
             last_locked_times[archive_manifest["install_target"]] = (
@@ -160,7 +160,7 @@ def _tag_manifest(manifest: BuildManifest, expected_tag: str) -> BuildManifest:
     """Add expected build tag to fields that are expected to include the build tag."""
     tagged_summaries: ArchiveSummaries = {}
     for kind, summaries in manifest["layers"].items():
-        tagged_summaries[kind] = new_summaries = []
+        tagged_summaries[str(kind)] = new_summaries = []
         for summary in summaries:
             new_summary = summary.copy()
             new_summaries.append(new_summary)
@@ -172,6 +172,30 @@
     return {"layers": tagged_summaries}
 
 
+def _replace_archive_suffix(
+    manifest: BuildManifest, expected_suffix: str, expected_build: int,
+) -> BuildManifest:
+    """Replace the expected file extension (and build number) in archive names."""
+    updated_summaries: ArchiveSummaries = {}
+    for kind, summaries in manifest["layers"].items():
+        updated_summaries[str(kind)] = new_summaries = []
+        for summary in summaries:
+            new_summary = summary.copy()
+            new_summaries.append(new_summary)
+            # Replace the expected extension in the archive name
+            default_name = summary["archive_name"]
+            updated_name = default_name.replace(
+                ARCHIVE_SUFFIX, expected_suffix
+            )
+            build_tag = "-1."
+            if build_tag in updated_name:
+                updated_tag = f"-{expected_build}."
+                updated_name = updated_name.replace(build_tag, updated_tag)
+            new_summary["archive_name"] = updated_name
+            new_summary["archive_build"] = expected_build
+    return {"layers": updated_summaries}
+
+
 ##########################
 # Test cases
 ##########################
@@ -496,6 +520,8 @@ def test_create_environments(self) -> None:
 
     @pytest.mark.slow
     def test_locking_and_publishing(self) -> None:
+        # Need long diffs to get useful output from metadata checks
+        self.maxDiff = None
         # This is organised as subtests in a monolithic test sequence to reduce CI overhead
         # Separating the tests wouldn't really make them independent, unless the outputs of
         # the earlier steps were checked in for use when testing the later steps. 
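As an aside, the tar-into-GzipFile pattern that the pack_venv.py hunk earlier in this patch relies on can be sketched in isolation as follows (the helper name is illustrative, not part of the patch). Writing an uncompressed tar stream (mode "w" rather than "w:gz") through an explicitly configured GzipFile pins the header timestamp to zero and ensures the data is only gzip-compressed once:

    import tarfile
    from gzip import GzipFile
    from pathlib import Path

    def write_reproducible_tgz(archive_path: str, source_dir: str) -> None:
        # The GzipFile wrapper compresses the stream and zeroes the header
        # timestamp; tarfile only emits the (uncompressed) tar data.
        with GzipFile(archive_path, mode="wb", mtime=0) as gz:
            with tarfile.open(mode="w", fileobj=gz) as tar:
                tar.add(source_dir, arcname=Path(source_dir).name)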
@@ -616,7 +642,31 @@ def test_locking_and_publishing(self) -> None:
         )
         self.check_archive_deployment(tagged_publication_result)
         subtests_passed += 1
-        # TODO: Add another test stage that confirms build versions increment as expected
+        # Test stage: check overriding the archive format works as expected.
+        # tar.gz is used, as that isn't the default on any platform.
+        # This also checks the build numbers increment as expected.
+        subtests_started += 1
+        with self.subTest("Check untagged tar.gz publication"):
+            gz_dry_run_result = _replace_archive_suffix(dry_run_result, ".tar.gz", 2)
+            gz_publication_result = build_env.publish_artifacts(format="tar.gz")
+            self.check_publication_result(
+                gz_publication_result, gz_dry_run_result, expected_tag=None
+            )
+            self.check_archive_deployment(gz_publication_result)
+            subtests_passed += 1
+        subtests_started += 1
+        with self.subTest("Check tagged tar.gz publication"):
+            gz_tagged_dry_run_result = _replace_archive_suffix(
+                tagged_dry_run_result, ".tar.gz", 2
+            )
+            gz_tagged_publication_result = build_env.publish_artifacts(
+                tag_outputs=True, format="tar.gz"
+            )
+            self.check_publication_result(
+                gz_tagged_publication_result, gz_tagged_dry_run_result, expected_tag
+            )
+            self.check_archive_deployment(gz_tagged_publication_result)
+            subtests_passed += 1
         # Work around pytest-subtests not failing the test case when subtests fail
         # https://github.com/pytest-dev/pytest-subtests/issues/76
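The reproducibility property these publication tests depend on can also be exercised in a standalone sketch (reusing the illustrative write_reproducible_tgz helper from above; this is not the project's actual test code). With the gzip header timestamp pinned, repeated builds from an unchanged input tree are byte-identical:

    import hashlib
    import tarfile
    import tempfile
    from gzip import GzipFile
    from pathlib import Path

    def write_reproducible_tgz(archive_path: str, source_dir: str) -> None:
        # Same pattern as the pack_venv.py hunk: zeroed gzip header timestamp,
        # with an uncompressed tar stream written through the wrapper
        with GzipFile(archive_path, mode="wb", mtime=0) as gz:
            with tarfile.open(mode="w", fileobj=gz) as tar:
                tar.add(source_dir, arcname=Path(source_dir).name)

    with tempfile.TemporaryDirectory() as work:
        source = Path(work, "env")
        source.mkdir()
        (source / "data.txt").write_text("stable contents\n")
        digests = []
        for build in ("build1", "build2"):
            out_dir = Path(work, build)
            out_dir.mkdir()
            # Keep the basename stable: GzipFile records it in the FNAME field
            archive = out_dir / "env.tar.gz"
            write_reproducible_tgz(str(archive), str(source))
            digests.append(hashlib.sha256(archive.read_bytes()).hexdigest())
        assert digests[0] == digests[1]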