From 705346f284b00954536212dc57f184aaf0915068 Mon Sep 17 00:00:00 2001 From: Nathan Hui Date: Mon, 24 Jul 2023 21:23:24 -0700 Subject: [PATCH 1/4] wip: Creates zip file --- e4e_data_management/data.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/e4e_data_management/data.py b/e4e_data_management/data.py index cd23381..bc89d2c 100644 --- a/e4e_data_management/data.py +++ b/e4e_data_management/data.py @@ -6,6 +6,7 @@ import json import logging import pickle +import zipfile from dataclasses import dataclass from hashlib import sha256 from pathlib import Path @@ -513,3 +514,15 @@ def commit(self) -> List[Path]: committed_files.extend(new_files) self.staged_files = [] return committed_files + + def create_zip(self, zip_path: Path) -> None: + """Creates a .zip archive of this Dataset at the specified location + + Args: + zip_path (Path): Path to .zip archive + """ + if zip_path.suffix.lower() != '.zip': + raise RuntimeError('Invalid suffix') + + with zipfile.ZipFile(file=zip_path, mode='w') as handle: + pass \ No newline at end of file From 7c920b1fc836cb04455f5567fb6436fd3d777f86 Mon Sep 17 00:00:00 2001 From: Nathan Hui Date: Sun, 15 Dec 2024 19:01:56 -0800 Subject: [PATCH 2/4] style: Fixes spaces and unused variables --- e4e_data_management/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/e4e_data_management/data.py b/e4e_data_management/data.py index 2c4d6e5..6953893 100644 --- a/e4e_data_management/data.py +++ b/e4e_data_management/data.py @@ -516,7 +516,7 @@ def commit(self) -> List[Path]: committed_files.extend(new_files) self.staged_files = [] return committed_files - + def create_zip(self, zip_path: Path) -> None: """Creates a .zip archive of this Dataset at the specified location @@ -525,6 +525,6 @@ def create_zip(self, zip_path: Path) -> None: """ if zip_path.suffix.lower() != '.zip': raise RuntimeError('Invalid suffix') - - with zipfile.ZipFile(file=zip_path, mode='w') as handle: - pass \ No newline at end 
of file + + with zipfile.ZipFile(file=zip_path, mode='w') as _: + pass From 6cdc42b7e60853efa9a50aa2fc383bb043740279 Mon Sep 17 00:00:00 2001 From: Nathan Hui Date: Sun, 15 Dec 2024 19:58:39 -0800 Subject: [PATCH 3/4] feat: Adds zip --- e4e_data_management/core.py | 27 ++++++------------ e4e_data_management/data.py | 42 +++++++++++++++++++++++++--- e4e_data_management/exception.py | 28 +++++++++++++++++++ tests/test_zip.py | 47 ++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 22 deletions(-) create mode 100644 e4e_data_management/exception.py create mode 100644 tests/test_zip.py diff --git a/e4e_data_management/core.py b/e4e_data_management/core.py index dd852b9..783807c 100644 --- a/e4e_data_management/core.py +++ b/e4e_data_management/core.py @@ -282,24 +282,7 @@ def push(self, path: Path) -> None: Args: path (Path): Destination to push completed dataset to """ - if any(len(mission.staged_files) != 0 - for mission in self.active_dataset.missions.values()) or \ - len(self.active_dataset.staged_files) != 0: - raise RuntimeError('Files still in staging') - - # Check that the README is present - readmes = [file - for file in list(self.active_dataset.root.glob('*')) - if re.match(fnmatch.translate('readme.*'), file.name, re.IGNORECASE)] - - if len(readmes) == 0: - raise RuntimeError('Readme not found') - acceptable_exts = ['.md', '.docx'] - if not any(readme.suffix.lower() in acceptable_exts for readme in readmes): - raise RuntimeError('Illegal README format') - - # validate self - self.active_dataset.validate() + self.active_dataset.check_complete() # Duplicate to destination destination = path.joinpath(self.active_dataset.name) @@ -317,6 +300,14 @@ def zip(self, output_path: Path) -> None: Args: output_path (Path): Output path """ + if output_path.suffix.lower() != '.zip': + output_path = output_path.joinpath( + self.active_dataset.name + '.zip') + + output_path.parent.mkdir(parents=True, exist_ok=True) + self.active_dataset.check_complete() + + 
self.active_dataset.create_zip(output_path) def unzip(self, input_file: Path, output_path: Path) -> None: """This will unzip the archived dataset to the specified root diff --git a/e4e_data_management/data.py b/e4e_data_management/data.py index 6953893..f77bd80 100644 --- a/e4e_data_management/data.py +++ b/e4e_data_management/data.py @@ -1,7 +1,8 @@ '''Data classes ''' from __future__ import annotations - +import re +import fnmatch import datetime as dt import json import logging @@ -15,7 +16,7 @@ Union) from e4e_data_management.metadata import Metadata - +from e4e_data_management.exception import MissionFilesInStaging, ReadmeFilesInStaging, ReadmeNotFound, CorruptedDataset @dataclass class StagedFile: @@ -526,5 +527,38 @@ def create_zip(self, zip_path: Path) -> None: if zip_path.suffix.lower() != '.zip': raise RuntimeError('Invalid suffix') - with zipfile.ZipFile(file=zip_path, mode='w') as _: - pass + with zipfile.ZipFile(file=zip_path, mode='w') as handle: + manifest = self.manifest.get_dict() + for file in manifest: + src_path = self.root.joinpath(file) + dest = Path(self.name) / file + handle.write(filename=src_path, arcname=dest) + + def check_complete(self) -> None: + """Checks if the dataset is complete + + Raises: + MissionFilesInStaging: Mission files remain in staging + ReadmeFilesInStaging: Readme files remain in staging + ReadmeNotFound: Readme files not found + ReadmeNotFound: Readme files with acceptable extension not found + CorruptedDataset: Dataset checksum validation failed + """ + staged_mission_files = (mission.staged_files + for mission in self.missions.values()) + if any(len(staged) for staged in staged_mission_files): + raise MissionFilesInStaging + if len(self.staged_files) != 0: + raise ReadmeFilesInStaging + + readmes = [file for file in self.root.glob('*') + if re.match(fnmatch.translate('readme.*'), file.name, re.IGNORECASE)] + if len(readmes) == 0: + raise ReadmeNotFound + + acceptable_exts = ['.md', '.docx'] + if not 
any(readme.suffix.lower() in acceptable_exts for readme in readmes): + raise ReadmeNotFound('Acceptable extension not found') + + if not self.validate(): + raise CorruptedDataset diff --git a/e4e_data_management/exception.py b/e4e_data_management/exception.py new file mode 100644 index 0000000..1987066 --- /dev/null +++ b/e4e_data_management/exception.py @@ -0,0 +1,28 @@ +'''E4E Data Management Exceptions +''' +from abc import ABC + + +class Incomplete(Exception, ABC): + """Dataset not complete + """ + + +class MissionFilesInStaging(Incomplete): + """Mission files still in staging area + """ + + +class ReadmeFilesInStaging(Incomplete): + """Readme files still in staging area + """ + + +class ReadmeNotFound(Incomplete): + """Readme files not found + """ + + +class CorruptedDataset(Exception): + """Corrupted Dataset + """ diff --git a/tests/test_zip.py b/tests/test_zip.py new file mode 100644 index 0000000..13a5f27 --- /dev/null +++ b/tests/test_zip.py @@ -0,0 +1,47 @@ +'''Tests zipping +''' +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Tuple +from unittest.mock import Mock +import zipfile +from e4e_data_management.core import DataManager + +SingleMissionFixture = Tuple[Tuple[Mock, + DataManager, Path], Tuple[Path, int, int]] + + +def test_zip_to_dir(single_mission_data: SingleMissionFixture, + test_readme: Path): + """Tests zipping data + + Args: + single_mission_data (SingleMissionFixture): Single Mission test fixture + test_readme (Path): Test Readme + """ + test_app, _ = single_mission_data + _, app, _ = test_app + + app.add([test_readme], readme=True) + app.commit(readme=True) + with TemporaryDirectory() as target_dir: + zip_path = Path(target_dir) + app.zip(zip_path) + + final_path = zip_path.joinpath(app.active_dataset.name + '.zip') + assert final_path.is_file() + + with zipfile.ZipFile(file=final_path, mode='r') as handle: + assert handle.testzip() is None + manifest = app.active_dataset.manifest.get_dict() + for name in
handle.filelist: + ar_name = Path(name.filename).relative_to( + app.active_dataset.name) + assert ar_name.as_posix() in manifest + + handle.extractall(target_dir) + + app.active_dataset.manifest.validate( + manifest=manifest, + files=Path(app.active_dataset.name).rglob('*') + ) From c3bead9a2cec33096e4f593b37611da35aceb727 Mon Sep 17 00:00:00 2001 From: Nathan Hui Date: Sun, 15 Dec 2024 19:59:52 -0800 Subject: [PATCH 4/4] style: Fixes styling --- e4e_data_management/core.py | 2 -- e4e_data_management/data.py | 11 ++++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/e4e_data_management/core.py b/e4e_data_management/core.py index 783807c..08ebb59 100644 --- a/e4e_data_management/core.py +++ b/e4e_data_management/core.py @@ -3,10 +3,8 @@ from __future__ import annotations import datetime as dt -import fnmatch import logging import pickle -import re from pathlib import Path from shutil import copy2, rmtree from typing import Dict, Iterable, List, Optional, Set diff --git a/e4e_data_management/data.py b/e4e_data_management/data.py index f77bd80..82b0917 100644 --- a/e4e_data_management/data.py +++ b/e4e_data_management/data.py @@ -1,12 +1,13 @@ '''Data classes ''' from __future__ import annotations -import re -import fnmatch + import datetime as dt +import fnmatch import json import logging import pickle +import re import zipfile from dataclasses import dataclass from hashlib import sha256 @@ -15,8 +16,12 @@ from typing import (Callable, Dict, Generator, Iterable, List, Optional, Set, Union) +from e4e_data_management.exception import (CorruptedDataset, + MissionFilesInStaging, + ReadmeFilesInStaging, + ReadmeNotFound) from e4e_data_management.metadata import Metadata -from e4e_data_management.exception import MissionFilesInStaging, ReadmeFilesInStaging, ReadmeNotFound, CorruptedDataset + @dataclass class StagedFile: