Actually test gzip archive generation
ncoghlan committed Nov 27, 2024
1 parent ea38e52 commit c4a1fc8
Showing 4 changed files with 116 additions and 25 deletions.
37 changes: 23 additions & 14 deletions src/venvstacks/pack_venv.py
@@ -39,10 +39,12 @@
 import tempfile
 import time
 
+from contextlib import ExitStack
 from datetime import datetime, timedelta, timezone, tzinfo
 from enum import StrEnum
+from gzip import GzipFile
 from pathlib import Path
-from typing import Any, Callable, cast, Self, TextIO
+from typing import Any, Callable, cast, BinaryIO, Self, TextIO
 
 from ._injected import postinstall as _default_postinstall
 from ._util import as_normalized_path, StrPath, WINDOWS_BUILD as _WINDOWS_BUILD
@@ -161,7 +163,7 @@ def make_archive(
                 base_dir,
                 max_mtime,
                 progress_callback,
-                compress=str(self),
+                compress=self.get_compression(),
             )
         # Not a tar compression format -> emit a zipfile instead
         return _make_zipfile(
@@ -330,10 +332,10 @@ def report_progress(_: Any) -> None:
     # between the number of paths found by `rglob` and the number of archive entries
     progress_bar.show(1.0)
     # The name query and the archive creation should always report the same archive name
-    assert archive_with_extension == os.fspath(
-        archive_format.get_archive_path(archive_base_name)
-    )
-    return Path(archive_with_extension)
+    created_path = Path(archive_with_extension)
+    expected_path = archive_format.get_archive_path(archive_base_name)
+    assert created_path == expected_path, f"{created_path} != {expected_path}"
+    return created_path
 
 
 # Would prefer to use shutil.make_archive, but the way it works doesn't quite fit this case
@@ -355,7 +357,7 @@ def _make_tar_archive(
 ) -> str:
     """Create a (possibly compressed) tar file from all the files under 'base_dir'.
 
-    'compress' must be "gzip", "bzip2", "xz", or None.
+    'compress' must be "gzip", "bzip2", "xz", the empty string, or None.
 
     Owner and group info is always set to 0/"root" as per
     https://reproducible-builds.org/docs/archives/.
@@ -431,15 +433,22 @@ def _process_archive_entry(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
         return tarinfo
 
     # creating the tarball
-    tar = tarfile.open(archive_name, tar_mode)
-    arcname = base_dir
-    if root_dir is not None:
-        base_dir = os.path.join(root_dir, base_dir)
-    try:
+    with ExitStack() as stack:
+        if _clamp_mtime is not None and compress == "gzip":
+            # Zero out the timestamp in the gzip header
+            storage = cast(BinaryIO, GzipFile(archive_name, mode="w", mtime=0))
+            stack.enter_context(storage)
+        else:
+            # Either mtime is not being clamped, or there is no time in the file header
+            storage = None
+
+        tar = tarfile.open(archive_name, tar_mode, fileobj=storage)
+        stack.enter_context(tar)
+        arcname = base_dir
+        if root_dir is not None:
+            base_dir = os.path.join(root_dir, base_dir)
         # In Python 3.7+, tar.add inherently adds entries in sorted order
         tar.add(base_dir, arcname, filter=_process_archive_entry)
-    finally:
-        tar.close()
 
     if root_dir is not None:
         archive_name = os.path.abspath(archive_name)
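Note: the substantive change above routes gzip-compressed tar output through GzipFile(archive_name, mode="w", mtime=0), zeroing the timestamp that gzip otherwise embeds in its header. A minimal standalone sketch of that behaviour (illustration only, not code from this commit; it uses just the standard library gzip module):

import gzip
import io


def deterministic_gzip(data: bytes) -> bytes:
    """Compress data with a zeroed gzip header timestamp."""
    buf = io.BytesIO()
    # mtime=0 zeroes the MTIME field in the gzip header (RFC 1952);
    # without it GzipFile records the current time, so two otherwise
    # identical archives differ byte-for-byte between builds.
    with gzip.GzipFile(fileobj=buf, mode="wb", mtime=0) as gz:
        gz.write(data)
    return buf.getvalue()


assert deterministic_gzip(b"payload") == deterministic_gzip(b"payload")

This is also why the diff only wraps the tar stream in GzipFile when _clamp_mtime is set: if timestamps aren't being clamped, header determinism isn't a goal, and tarfile's own compression suffices.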
18 changes: 10 additions & 8 deletions src/venvstacks/stacks.py
@@ -708,29 +708,31 @@ def create_archive(
         )
         build_metadata = self.build_metadata
         archive_base_path = self.archive_base_path
-        built_archive_path = archive_base_path.parent / build_metadata["archive_name"]
+        archive_path = archive_base_path.parent / build_metadata["archive_name"]
         if not self.needs_build:
             # Already built archive looks OK, so just return the same metadata as last build
-            print(f"Using previously built archive at {str(built_archive_path)!r}")
+            print(f"Using previously built archive at {str(archive_path)!r}")
             previous_metadata = self.archive_metadata
             assert previous_metadata is not None
-            return previous_metadata, built_archive_path
-        if built_archive_path.exists():
-            print(f"Removing outdated archive at {str(built_archive_path)!r}")
-            built_archive_path.unlink()
+            return previous_metadata, archive_path
+        if archive_path.exists():
+            print(f"Removing outdated archive at {str(archive_path)!r}")
+            archive_path.unlink()
         print(f"Creating archive for {str(env_path)!r}")
         last_locked = self.env_lock.last_locked
         if work_path is None:
             # /tmp is likely too small for ML/AI environments
             work_path = self.env_path.parent
-        archive_path = pack_venv.create_archive(
+        built_path = pack_venv.create_archive(
             env_path,
             archive_base_path,
             clamp_mtime=last_locked,
             work_dir=work_path,
             install_target=build_metadata["install_target"],
             archive_format=self.archive_format,
         )
-        assert built_archive_path == archive_path  # pack_venv ensures this is true
+        # This assertion can be helpful when debugging archive creation changes
+        assert built_path == archive_path, f"{built_path} != {archive_path}"
         print(f"Created {str(archive_path)!r} from {str(env_path)!r}")
 
         metadata = ArchiveMetadata(
30 changes: 30 additions & 0 deletions tests/test_cli_invocation.py
@@ -474,6 +474,22 @@ def test_index_options(
             dry_run=True, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         ("--lock", "--build", "--publish", "--clean", "--tag-outputs"),
         dict(lock=True, build=True, publish=True),  # select_operations
@@ -643,6 +659,20 @@ def test_mock_lock_usage_error(
             force=False, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         (
             "--no-force",
Expand Down
56 changes: 53 additions & 3 deletions tests/test_minimal_project.py
@@ -147,7 +147,7 @@ def _filter_manifest(
     filtered_summaries: ArchiveSummaries = {}
     last_locked_times: LastLockedTimes = {}
     for kind, archive_manifests in manifest["layers"].items():
-        filtered_summaries[kind] = summaries = []
+        filtered_summaries[str(kind)] = summaries = []
         for archive_manifest in archive_manifests:
             summaries.append(_filter_archive_manifest(archive_manifest))
             last_locked_times[archive_manifest["install_target"]] = (
@@ -160,7 +160,7 @@ def _tag_manifest(manifest: BuildManifest, expected_tag: str) -> BuildManifest:
     """Add expected build tag to fields that are expected to include the build tag."""
     tagged_summaries: ArchiveSummaries = {}
     for kind, summaries in manifest["layers"].items():
-        tagged_summaries[kind] = new_summaries = []
+        tagged_summaries[str(kind)] = new_summaries = []
         for summary in summaries:
             new_summary = summary.copy()
             new_summaries.append(new_summary)
@@ -172,6 +172,30 @@ def _tag_manifest(manifest: BuildManifest, expected_tag: str) -> BuildManifest:
     return {"layers": tagged_summaries}
 
 
+def _replace_archive_suffix(
+    manifest: BuildManifest, expected_suffix: str, expected_build: int,
+) -> BuildManifest:
+    """Replace the expected file extension in archive names."""
+    updated_summaries: ArchiveSummaries = {}
+    for kind, summaries in manifest["layers"].items():
+        updated_summaries[str(kind)] = new_summaries = []
+        for summary in summaries:
+            new_summary = summary.copy()
+            new_summaries.append(new_summary)
+            # Replace the expected extension in the archive name
+            default_name = summary["archive_name"]
+            updated_name = default_name.replace(
+                ARCHIVE_SUFFIX, expected_suffix
+            )
+            build_tag = "-1."
+            if build_tag in updated_name:
+                updated_tag = f"-{expected_build}."
+                updated_name = updated_name.replace(build_tag, updated_tag)
+            new_summary["archive_name"] = updated_name
+            new_summary["archive_build"] = expected_build
+    return {"layers": updated_summaries}
+
+
 ##########################
 # Test cases
 ##########################
@@ -496,6 +520,8 @@ def test_create_environments(self) -> None:
 
     @pytest.mark.slow
     def test_locking_and_publishing(self) -> None:
+        # Need long diffs to get useful output from metadata checks
+        self.maxDiff = None
         # This is organised as subtests in a monolithic test sequence to reduce CI overhead
         # Separating the tests wouldn't really make them independent, unless the outputs of
         # the earlier steps were checked in for use when testing the later steps.
@@ -616,7 +642,31 @@ def test_locking_and_publishing(self) -> None:
             )
             self.check_archive_deployment(tagged_publication_result)
             subtests_passed += 1
-        # TODO: Add another test stage that confirms build versions increment as expected
+        # Test stage: check overriding the archive format works as expected.
+        # tar.gz is used, as that isn't the default on any platform.
+        # This also checks the build numbers increment as expected.
+        subtests_started += 1
+        with self.subTest("Check untagged tar.gz publication"):
+            gz_dry_run_result = _replace_archive_suffix(dry_run_result, ".tar.gz", 2)
+            gz_publication_result = build_env.publish_artifacts(format="tar.gz")
+            self.check_publication_result(
+                gz_publication_result, gz_dry_run_result, expected_tag=None
+            )
+            self.check_archive_deployment(gz_publication_result)
+            subtests_passed += 1
+        subtests_started += 1
+        with self.subTest("Check tagged tar.gz publication"):
+            gz_tagged_dry_run_result = _replace_archive_suffix(
+                tagged_dry_run_result, ".tar.gz", 2
+            )
+            gz_tagged_publication_result = build_env.publish_artifacts(
+                tag_outputs=True, format="tar.gz"
+            )
+            self.check_publication_result(
+                gz_tagged_publication_result, gz_tagged_dry_run_result, expected_tag
+            )
+            self.check_archive_deployment(gz_tagged_publication_result)
+            subtests_passed += 1
 
         # Work around pytest-subtests not failing the test case when subtests fail
         # https://github.com/pytest-dev/pytest-subtests/issues/76
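Note: the new subtests compare published tar.gz metadata and deployments against adjusted dry-run expectations. A complementary check (a sketch only, not part of this commit) would read the gzip header directly to confirm the clamped timestamp: per RFC 1952, bytes 4-8 of a gzip file hold the little-endian MTIME field, which should be zero for a reproducible archive.

import struct
from pathlib import Path


def read_gzip_header_mtime(archive_path: Path) -> int:
    """Return the MTIME field from a gzip file's 10-byte header."""
    with archive_path.open("rb") as f:
        header = f.read(10)
    # Bytes 0-1: magic; byte 2: method; byte 3: flags; bytes 4-8: MTIME (little-endian)
    assert header[:2] == b"\x1f\x8b", "not a gzip file"
    return struct.unpack("<I", header[4:8])[0]


# Hypothetical usage in a test body (the archive path name is assumed):
# assert read_gzip_header_mtime(built_archive_path) == 0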
