Actually test gzip archive generation
ncoghlan committed Nov 27, 2024
1 parent ea38e52 commit c4a1fc8
Showing 4 changed files with 116 additions and 25 deletions.
37 changes: 23 additions & 14 deletions src/venvstacks/pack_venv.py
@@ -39,10 +39,12 @@
 import tempfile
 import time
 
+from contextlib import ExitStack
 from datetime import datetime, timedelta, timezone, tzinfo
 from enum import StrEnum
+from gzip import GzipFile
 from pathlib import Path
-from typing import Any, Callable, cast, Self, TextIO
+from typing import Any, Callable, cast, BinaryIO, Self, TextIO
 
 from ._injected import postinstall as _default_postinstall
 from ._util import as_normalized_path, StrPath, WINDOWS_BUILD as _WINDOWS_BUILD
@@ -161,7 +163,7 @@ def make_archive(
                 base_dir,
                 max_mtime,
                 progress_callback,
-                compress=str(self),
+                compress=self.get_compression(),
             )
         # Not a tar compression format -> emit a zipfile instead
         return _make_zipfile(
@@ -330,10 +332,10 @@ def report_progress(_: Any) -> None:
     # between the number of paths found by `rglob` and the number of archive entries
     progress_bar.show(1.0)
     # The name query and the archive creation should always report the same archive name
-    assert archive_with_extension == os.fspath(
-        archive_format.get_archive_path(archive_base_name)
-    )
-    return Path(archive_with_extension)
+    created_path = Path(archive_with_extension)
+    expected_path = archive_format.get_archive_path(archive_base_name)
+    assert created_path == expected_path, f"{created_path} != {expected_path}"
+    return created_path
 
 
 # Would prefer to use shutil.make_archive, but the way it works doesn't quite fit this case
@@ -355,7 +357,7 @@ def _make_tar_archive(
 ) -> str:
     """Create a (possibly compressed) tar file from all the files under 'base_dir'.
 
-    'compress' must be "gzip", "bzip2", "xz", or None.
+    'compress' must be "gzip", "bzip2", "xz", the empty string, or None.
 
     Owner and group info is always set to 0/"root" as per
     https://reproducible-builds.org/docs/archives/.
@@ -431,15 +433,22 @@ def _process_archive_entry(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
         return tarinfo
 
     # creating the tarball
-    tar = tarfile.open(archive_name, tar_mode)
-    arcname = base_dir
-    if root_dir is not None:
-        base_dir = os.path.join(root_dir, base_dir)
-    try:
+    with ExitStack() as stack:
+        if _clamp_mtime is not None and compress == "gzip":
+            # Zero out the timestamp in the gzip header
+            storage = cast(BinaryIO, GzipFile(archive_name, mode="w", mtime=0))
+            stack.enter_context(storage)
+        else:
+            # Either mtime is not being clamped, or there is no time in the file header
+            storage = None
+
+        tar = tarfile.open(archive_name, tar_mode, fileobj=storage)
+        stack.enter_context(tar)
+        arcname = base_dir
+        if root_dir is not None:
+            base_dir = os.path.join(root_dir, base_dir)
         # In Python 3.7+, tar.add inherently adds entries in sorted order
         tar.add(base_dir, arcname, filter=_process_archive_entry)
-    finally:
-        tar.close()
 
     if root_dir is not None:
         archive_name = os.path.abspath(archive_name)
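Note: the substantive change above routes gzip-compressed tar output through GzipFile(archive_name, mode="w", mtime=0), zeroing the timestamp that gzip otherwise embeds in its header. A minimal standalone sketch of that behaviour (illustration only, not code from this commit; it uses just the standard library gzip module):

import gzip
import io


def deterministic_gzip(data: bytes) -> bytes:
    """Compress data with a zeroed gzip header timestamp."""
    buf = io.BytesIO()
    # mtime=0 zeroes the MTIME field in the gzip header (RFC 1952);
    # without it GzipFile records the current time, so two otherwise
    # identical archives differ byte-for-byte between builds.
    with gzip.GzipFile(fileobj=buf, mode="wb", mtime=0) as gz:
        gz.write(data)
    return buf.getvalue()


assert deterministic_gzip(b"payload") == deterministic_gzip(b"payload")

This is also why the diff only wraps the tar stream in GzipFile when _clamp_mtime is set: if timestamps aren't being clamped, header determinism isn't a goal, and tarfile's own compression suffices.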
18 changes: 10 additions & 8 deletions src/venvstacks/stacks.py
@@ -708,29 +708,31 @@ def create_archive(
         )
         build_metadata = self.build_metadata
         archive_base_path = self.archive_base_path
-        built_archive_path = archive_base_path.parent / build_metadata["archive_name"]
+        archive_path = archive_base_path.parent / build_metadata["archive_name"]
         if not self.needs_build:
             # Already built archive looks OK, so just return the same metadata as last build
-            print(f"Using previously built archive at {str(built_archive_path)!r}")
+            print(f"Using previously built archive at {str(archive_path)!r}")
             previous_metadata = self.archive_metadata
             assert previous_metadata is not None
-            return previous_metadata, built_archive_path
-        if built_archive_path.exists():
-            print(f"Removing outdated archive at {str(built_archive_path)!r}")
-            built_archive_path.unlink()
+            return previous_metadata, archive_path
+        if archive_path.exists():
+            print(f"Removing outdated archive at {str(archive_path)!r}")
+            archive_path.unlink()
         print(f"Creating archive for {str(env_path)!r}")
         last_locked = self.env_lock.last_locked
         if work_path is None:
             # /tmp is likely too small for ML/AI environments
             work_path = self.env_path.parent
-        archive_path = pack_venv.create_archive(
+        built_path = pack_venv.create_archive(
             env_path,
             archive_base_path,
             clamp_mtime=last_locked,
             work_dir=work_path,
             install_target=build_metadata["install_target"],
             archive_format=self.archive_format,
         )
-        assert built_archive_path == archive_path  # pack_venv ensures this is true
+        # This assertion can be helpful when debugging archive creation changes
+        assert built_path == archive_path, f"{built_path} != {archive_path}"
         print(f"Created {str(archive_path)!r} from {str(env_path)!r}")
 
         metadata = ArchiveMetadata(
30 changes: 30 additions & 0 deletions tests/test_cli_invocation.py
@@ -474,6 +474,22 @@ def test_index_options(
             dry_run=True, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=True, publish=True),  # select_operations
+        dict(clean=False, lock=False),  # create_environments
+        dict(
+            dry_run=True, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         ("--lock", "--build", "--publish", "--clean", "--tag-outputs"),
         dict(lock=True, build=True, publish=True),  # select_operations
@@ -643,6 +659,20 @@ def test_mock_lock_usage_error(
             force=False, tag_outputs=True, format=DEFAULT_ARCHIVE_FORMAT
         ),  # publish_artifacts
     ),
+    (
+        ("--format", "tar.gz"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.gz
+        ),  # publish_artifacts
+    ),
+    (
+        ("--format", "zip"),
+        dict(lock=False, build=False, publish=True),  # select_operations
+        dict(
+            force=False, tag_outputs=False, format=ArchiveFormat.zip
+        ),  # publish_artifacts
+    ),
     (
         (
             "--no-force",
Expand Down
56 changes: 53 additions & 3 deletions tests/test_minimal_project.py
@@ -147,7 +147,7 @@ def _filter_manifest(
     filtered_summaries: ArchiveSummaries = {}
     last_locked_times: LastLockedTimes = {}
     for kind, archive_manifests in manifest["layers"].items():
-        filtered_summaries[kind] = summaries = []
+        filtered_summaries[str(kind)] = summaries = []
         for archive_manifest in archive_manifests:
             summaries.append(_filter_archive_manifest(archive_manifest))
             last_locked_times[archive_manifest["install_target"]] = (
@@ -160,7 +160,7 @@ def _tag_manifest(manifest: BuildManifest, expected_tag: str) -> BuildManifest:
     """Add expected build tag to fields that are expected to include the build tag."""
     tagged_summaries: ArchiveSummaries = {}
     for kind, summaries in manifest["layers"].items():
-        tagged_summaries[kind] = new_summaries = []
+        tagged_summaries[str(kind)] = new_summaries = []
         for summary in summaries:
             new_summary = summary.copy()
             new_summaries.append(new_summary)
@@ -172,6 +172,30 @@ def _tag_manifest(manifest: BuildManifest, expected_tag: str) -> BuildManifest:
     return {"layers": tagged_summaries}
 
 
+def _replace_archive_suffix(
+    manifest: BuildManifest, expected_suffix: str, expected_build: int,
+) -> BuildManifest:
+    """Replace the expected file extension in archive names."""
+    updated_summaries: ArchiveSummaries = {}
+    for kind, summaries in manifest["layers"].items():
+        updated_summaries[str(kind)] = new_summaries = []
+        for summary in summaries:
+            new_summary = summary.copy()
+            new_summaries.append(new_summary)
+            # Replace the expected extension in the archive name
+            default_name = summary["archive_name"]
+            updated_name = default_name.replace(
+                ARCHIVE_SUFFIX, expected_suffix
+            )
+            build_tag = "-1."
+            if build_tag in updated_name:
+                updated_tag = f"-{expected_build}."
+                updated_name = updated_name.replace(build_tag, updated_tag)
+            new_summary["archive_name"] = updated_name
+            new_summary["archive_build"] = expected_build
+    return {"layers": updated_summaries}
+
+
 ##########################
 # Test cases
 ##########################
@@ -496,6 +520,8 @@ def test_create_environments(self) -> None:
 
     @pytest.mark.slow
     def test_locking_and_publishing(self) -> None:
+        # Need long diffs to get useful output from metadata checks
+        self.maxDiff = None
         # This is organised as subtests in a monolithic test sequence to reduce CI overhead
         # Separating the tests wouldn't really make them independent, unless the outputs of
         # the earlier steps were checked in for use when testing the later steps.
@@ -616,7 +642,31 @@ def test_locking_and_publishing(self) -> None:
             )
             self.check_archive_deployment(tagged_publication_result)
             subtests_passed += 1
-        # TODO: Add another test stage that confirms build versions increment as expected
+        # Test stage: check overriding the archive format works as expected.
+        # tar.gz is used, as that isn't the default on any platform.
+        # This also checks the build numbers increment as expected.
+        subtests_started += 1
+        with self.subTest("Check untagged tar.gz publication"):
+            gz_dry_run_result = _replace_archive_suffix(dry_run_result, ".tar.gz", 2)
+            gz_publication_result = build_env.publish_artifacts(format="tar.gz")
+            self.check_publication_result(
+                gz_publication_result, gz_dry_run_result, expected_tag=None
+            )
+            self.check_archive_deployment(gz_publication_result)
+            subtests_passed += 1
+        subtests_started += 1
+        with self.subTest("Check tagged tar.gz publication"):
+            gz_tagged_dry_run_result = _replace_archive_suffix(
+                tagged_dry_run_result, ".tar.gz", 2
+            )
+            gz_tagged_publication_result = build_env.publish_artifacts(
+                tag_outputs=True, format="tar.gz"
+            )
+            self.check_publication_result(
+                gz_tagged_publication_result, gz_tagged_dry_run_result, expected_tag
+            )
+            self.check_archive_deployment(gz_tagged_publication_result)
+            subtests_passed += 1
 
         # Work around pytest-subtests not failing the test case when subtests fail
         # https://github.com/pytest-dev/pytest-subtests/issues/76
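Note: the new subtests compare published tar.gz metadata and deployments against adjusted dry-run expectations. A complementary check (a sketch only, not part of this commit) would read the gzip header directly to confirm the clamped timestamp: per RFC 1952, bytes 4-8 of a gzip file hold the little-endian MTIME field, which should be zero for a reproducible archive.

import struct
from pathlib import Path


def read_gzip_header_mtime(archive_path: Path) -> int:
    """Return the MTIME field from a gzip file's 10-byte header."""
    with archive_path.open("rb") as f:
        header = f.read(10)
    # Bytes 0-1: magic; byte 2: method; byte 3: flags; bytes 4-8: MTIME (little-endian)
    assert header[:2] == b"\x1f\x8b", "not a gzip file"
    return struct.unpack("<I", header[4:8])[0]


# Hypothetical usage in a test body (the archive path name is assumed):
# assert read_gzip_header_mtime(built_archive_path) == 0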
