From 8be7e887823b0240766566ebbef03fb966ef9200 Mon Sep 17 00:00:00 2001 From: "Thomas J. Zajac" Date: Mon, 30 Sep 2024 10:27:10 +0000 Subject: [PATCH 01/10] Code + readme changes --- .pyproject_generation/pyproject_custom.toml | 2 +- README.md | 25 ++++++----- pyproject.toml | 2 +- src/ghga_datasteward_kit/file_ingest.py | 5 +++ src/ghga_datasteward_kit/models.py | 41 ++++++++++++++++++- .../s3_upload/entrypoint.py | 2 + 6 files changed, 63 insertions(+), 14 deletions(-) diff --git a/.pyproject_generation/pyproject_custom.toml b/.pyproject_generation/pyproject_custom.toml index f2b76ab..d26ea5c 100644 --- a/.pyproject_generation/pyproject_custom.toml +++ b/.pyproject_generation/pyproject_custom.toml @@ -1,6 +1,6 @@ [project] name = "ghga_datasteward_kit" -version = "4.3.0" +version = "4.4.0" description = "GHGA Data Steward Kit - A utils package for GHGA data stewards." dependencies = [ "crypt4gh >=1.6, <2", diff --git a/README.md b/README.md index f511436..acc3109 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,8 @@ This is achieved using the data steward kit, using the following steps: storage is done in one go. This is achieved using either the `ghga-datasteward-kit files upload` for uploading a single file or the `ghga-datasteward-kit files batch-upload` for uploading multiple files at once. + There also exist legacy versions of these subcommands for compatibility reasons, + where the commonad is prefixed with `legacy-`. Please see [this section](#files-batch-upload) for further details. This will output one summary JSON per uploaded file. The encryption secret is automatically transferred to GHGA central. @@ -120,11 +122,9 @@ content to a (remote) S3-compatible object storage. This process consists of multiple steps: 1. Generate a unique file id 2. Create unencrypted file checksum -3. Encrypt file -4. Extract file secret and remove Crypt4GH envelope -5. Upload encrypted file content -6. Download encrypted file content, decrypt and verify checksum -7. Write file/upload information to output file +3. Encrypt and upload file in chunks +4. Download encrypted file content, decrypt and verify checksum +5. Write file/upload information to output file The user needs to provide a config yaml containing information as described [here](./s3_upload_config.md). @@ -135,11 +135,13 @@ An overview of important information about each the upload is written to a file It contains the following information: 1. The file alias 2. A unique identifier for the file -3. The local file path -4. A SHA256 checksum over the unencrypted content -5. MD5 checksums over all encrypted file parts -6. SHA256 checksums over all encrypted file parts -7. The file encryption/decryption secret +3. An identifier of the storage bucket the file was uploaded to (Added in v4.4.0) +4. The local file path +5. A SHA256 checksum over the unencrypted content +6. MD5 checksums over all encrypted file parts +7. SHA256 checksums over all encrypted file parts +8. The file encryption/decryption secret id or the actual textual representation of the secret, if the legacy command was used (Secret ID since v1.0.0) +9. An alias for the storage node the file was uploaded to (Added in v3.0.0) Attention: Keep this output file in a safe, private location. If this file is lost, the uploaded file content becomes inaccessible. @@ -154,6 +156,9 @@ running system and make the corresponding files available for download. This command requires a configuration file as described [here](./ingest_config.md). 
+### ingest version compatibility +Currently v4.4.0 of this tool and v3.2.0 of the `File Ingest Service` are compatible. + ### metadata *To be performed by Central Data Stewards only.* diff --git a/pyproject.toml b/pyproject.toml index 87657b1..82ed496 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Intended Audience :: Developers", ] name = "ghga_datasteward_kit" -version = "4.3.0" +version = "4.4.0" description = "GHGA Data Steward Kit - A utils package for GHGA data stewards." dependencies = [ "crypt4gh >=1.6, <2", diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index cdc10cc..61f2d1f 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -14,6 +14,7 @@ # limitations under the License. """Interaction with file ingest service""" +import logging from collections.abc import Callable from pathlib import Path @@ -26,6 +27,8 @@ from ghga_datasteward_kit import models, utils +LOG = logging.getLogger(__name__) + class IngestConfig(SubmissionStoreConfig): """Config options for calling the file ingest endpoint""" @@ -140,11 +143,13 @@ def file_ingest( input_path=in_path, selected_alias=config.selected_storage_alias ) endpoint = config.file_ingest_federated_endpoint + LOG.info("Selected non-legacy endpoint %s for file %s.", endpoint, in_path) except (KeyError, ValidationError): output_metadata = models.LegacyOutputMetadata.load( input_path=in_path, selected_alias=config.selected_storage_alias ) endpoint = config.file_ingest_legacy_endpoint + LOG.info("Selected legacy endpoint %s for file %s.", endpoint, in_path) endpoint_url = utils.path_join(config.file_ingest_baseurl, endpoint) diff --git a/src/ghga_datasteward_kit/models.py b/src/ghga_datasteward_kit/models.py index b57a729..236f4ba 100644 --- a/src/ghga_datasteward_kit/models.py +++ b/src/ghga_datasteward_kit/models.py @@ -16,6 +16,7 @@ import hashlib import json +import logging import os from pathlib import Path from typing import Any @@ -23,6 +24,8 @@ from ghga_service_commons.utils.crypt import encrypt from pydantic import BaseModel +LOG = logging.getLogger(__name__) + class Checksums: """Container for checksum calculation""" @@ -68,6 +71,7 @@ class MetadataBase(BaseModel): """Common base for all output and upload models""" file_id: str + bucket_id: str | None object_id: str part_size: int unencrypted_size: int @@ -81,6 +85,7 @@ def prepare_output(self) -> dict[str, str]: """Prepare shared fields for output""" output: dict[str, Any] = {} + output["Bucket ID"] = self.bucket_id output["File UUID"] = self.file_id output["Part Size"] = f"{self.part_size // 1024**2} MiB" output["Unencrypted file size"] = self.unencrypted_size @@ -129,12 +134,26 @@ def load(cls, input_path: Path, selected_alias: str): with input_path.open("r") as infile: data = json.load(infile) - # Support for older file uploads without explicit storage alias + # Support for older file uploads without explicit storage alias or bucket id # Ingest the configured selected alias if none can be found in the metadata try: storage_alias = data["Storage alias"] except KeyError: + LOG.warning( + "Could not find storage alias in metadata, populating with configured alias '%s' instead.", + selected_alias, + ) storage_alias = selected_alias + try: + bucket_id = data["Bucket ID"] + except KeyError: + LOG.warning( + "Could not find bucket ID in metadata. 
Configure the selected_bucket_id option in" + " the file ingest service to populate older metadata with the correct value." + "Output metadata for different buckets needs to be split into different batches in this case." + ) + bucket_id = None + file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 @@ -142,6 +161,7 @@ def load(cls, input_path: Path, selected_alias: str): alias=data["Alias"], original_path=data["Original filesystem path"], file_id=file_id, + bucket_id=bucket_id, object_id=file_id, part_size=part_size, secret_id=data["Symmetric file encryption secret ID"], @@ -157,6 +177,7 @@ def to_upload_metadata(self, file_id: str): """Convert internal output file representation to request model""" return Metadata( file_id=file_id, + bucket_id=self.bucket_id, object_id=self.object_id, part_size=self.part_size, unencrypted_size=self.unencrypted_size, @@ -210,12 +231,26 @@ def load(cls, input_path: Path, selected_alias: str): with input_path.open("r") as infile: data = json.load(infile) - # Support for older file uploads without explicit storage alias + # Support for older file uploads without explicit storage alias or bucket id # Ingest the configured selected alias if none can be found in the metadata try: storage_alias = data["Storage alias"] except KeyError: + LOG.warning( + "Could not find storage alias in metadata, populating with configured alias '%s' instead.", + selected_alias, + ) storage_alias = selected_alias + try: + bucket_id = data["Bucket ID"] + except KeyError: + LOG.warning( + "Could not find bucket ID in metadata. Configure the selected_bucket_id option in" + " the file ingest service to populate older metadata with the correct value." + "Output metadata for different buckets needs to be split into different batches in this case." + ) + bucket_id = None + file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 @@ -223,6 +258,7 @@ def load(cls, input_path: Path, selected_alias: str): alias=data["Alias"], original_path=data["Original filesystem path"], file_id=file_id, + bucket_id=bucket_id, object_id=file_id, part_size=part_size, file_secret=data["Symmetric file encryption secret"], @@ -238,6 +274,7 @@ def to_upload_metadata(self, file_id: str): """Convert internal output file representation to request model""" return LegacyMetadata( file_id=file_id, + bucket_id=self.bucket_id, object_id=self.object_id, part_size=self.part_size, unencrypted_size=self.unencrypted_size, diff --git a/src/ghga_datasteward_kit/s3_upload/entrypoint.py b/src/ghga_datasteward_kit/s3_upload/entrypoint.py index 1aaacca..aadabd1 100755 --- a/src/ghga_datasteward_kit/s3_upload/entrypoint.py +++ b/src/ghga_datasteward_kit/s3_upload/entrypoint.py @@ -163,6 +163,7 @@ async def async_main(input_path: Path, alias: str, config: Config, token: str): metadata = models.OutputMetadata( alias=uploader.alias, file_id=uploader.file_id, + bucket_id=get_bucket_id(config=config), object_id=uploader.file_id, original_path=input_path, part_size=config.part_size, @@ -213,6 +214,7 @@ async def legacy_async_main(input_path: Path, alias: str, config: LegacyConfig): metadata = models.LegacyOutputMetadata( alias=uploader.alias, file_id=uploader.file_id, + bucket_id=get_bucket_id(config=config), object_id=uploader.file_id, original_path=input_path, part_size=config.part_size, From c04b76c853201055cfc699bf2afb0f342e068532 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Zajac" Date: Mon, 30 Sep 2024 11:07:11 +0000 Subject: [PATCH 02/10] User response detail to differentiate between decryption and missing bucket information issues --- src/ghga_datasteward_kit/file_ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index 61f2d1f..115de6c 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -179,7 +179,7 @@ def file_ingest( if response.status_code == 403: raise ValueError("Not authorized to access ingest endpoint.") if response.status_code == 422: - raise ValueError("Could not decrypt received payload.") + raise ValueError(response.json()["detail"]) if response.status_code == 500: raise ValueError( "Internal file ingest service error or communication with vault failed." From ce14cf79753d74fbcd3575c38ac9039cb70a6ade Mon Sep 17 00:00:00 2001 From: "Thomas J. Zajac" Date: Mon, 30 Sep 2024 12:05:35 +0000 Subject: [PATCH 03/10] No optional bucket id --- README.md | 4 ++-- src/ghga_datasteward_kit/file_ingest.py | 12 ++++++++++-- src/ghga_datasteward_kit/models.py | 20 +++++++++----------- src/ghga_datasteward_kit/s3_upload/config.py | 3 --- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index acc3109..b04d4e5 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,8 @@ running system and make the corresponding files available for download. This command requires a configuration file as described [here](./ingest_config.md). -### ingest version compatibility -Currently v4.4.0 of this tool and v3.2.0 of the `File Ingest Service` are compatible. +#### ingest version compatibility +Currently v4.4.0 of this tool and v4.0.0 of the `File Ingest Service` are compatible. ### metadata diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index 115de6c..214c9ed 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -80,6 +80,10 @@ class IngestConfig(SubmissionStoreConfig): + " During the later ingest phase, the alias will be validated by the File Ingest Service." 
), ) + selected_bucket_id: str = Field( + default=..., + description="Fallback bucket_id for older output metadata files that don't contain a bucket ID.", + ) def alias_to_accession( @@ -140,13 +144,17 @@ def file_ingest( """ try: output_metadata = models.OutputMetadata.load( - input_path=in_path, selected_alias=config.selected_storage_alias + input_path=in_path, + selected_alias=config.selected_storage_alias, + selected_bucket=config.selected_bucket_id, ) endpoint = config.file_ingest_federated_endpoint LOG.info("Selected non-legacy endpoint %s for file %s.", endpoint, in_path) except (KeyError, ValidationError): output_metadata = models.LegacyOutputMetadata.load( - input_path=in_path, selected_alias=config.selected_storage_alias + input_path=in_path, + selected_alias=config.selected_storage_alias, + selected_bucket=config.selected_bucket_id, ) endpoint = config.file_ingest_legacy_endpoint LOG.info("Selected legacy endpoint %s for file %s.", endpoint, in_path) diff --git a/src/ghga_datasteward_kit/models.py b/src/ghga_datasteward_kit/models.py index 236f4ba..1804625 100644 --- a/src/ghga_datasteward_kit/models.py +++ b/src/ghga_datasteward_kit/models.py @@ -71,7 +71,7 @@ class MetadataBase(BaseModel): """Common base for all output and upload models""" file_id: str - bucket_id: str | None + bucket_id: str object_id: str part_size: int unencrypted_size: int @@ -129,7 +129,7 @@ def serialize(self, output_path: Path): os.chmod(path=output_path, mode=0o400) @classmethod - def load(cls, input_path: Path, selected_alias: str): + def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): """Load metadata from serialized file""" with input_path.open("r") as infile: data = json.load(infile) @@ -148,11 +148,10 @@ def load(cls, input_path: Path, selected_alias: str): bucket_id = data["Bucket ID"] except KeyError: LOG.warning( - "Could not find bucket ID in metadata. Configure the selected_bucket_id option in" - " the file ingest service to populate older metadata with the correct value." - "Output metadata for different buckets needs to be split into different batches in this case." + "Could not find bucket ID in metadata, populating with configured alias '%s' instead.", + selected_bucket, ) - bucket_id = None + bucket_id = selected_bucket file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 @@ -226,7 +225,7 @@ def serialize(self, output_path: Path): os.chmod(path=output_path, mode=0o400) @classmethod - def load(cls, input_path: Path, selected_alias: str): + def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): """Load metadata from serialized file""" with input_path.open("r") as infile: data = json.load(infile) @@ -245,11 +244,10 @@ def load(cls, input_path: Path, selected_alias: str): bucket_id = data["Bucket ID"] except KeyError: LOG.warning( - "Could not find bucket ID in metadata. Configure the selected_bucket_id option in" - " the file ingest service to populate older metadata with the correct value." - "Output metadata for different buckets needs to be split into different batches in this case." 
+ "Could not find bucket ID in metadata, populating with configured alias '%s' instead.", + selected_bucket, ) - bucket_id = None + bucket_id = selected_bucket file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 diff --git a/src/ghga_datasteward_kit/s3_upload/config.py b/src/ghga_datasteward_kit/s3_upload/config.py index e559c3e..c369c57 100644 --- a/src/ghga_datasteward_kit/s3_upload/config.py +++ b/src/ghga_datasteward_kit/s3_upload/config.py @@ -119,9 +119,6 @@ class LegacyConfig(S3ObjectStoragesConfig): default=5, description="Number of times a request should be retried on non critical errors.", ) - debug: bool = Field( - default=False, description="Enable debug functionality for upload." - ) @field_validator("output_dir") def expand_env_vars_output_dir(cls, output_dir: Path): # noqa: N805 From 90b7723754d3ba2496fc2f5bf835cdbebedef665 Mon Sep 17 00:00:00 2001 From: "Thomas J. Zajac" Date: Mon, 30 Sep 2024 13:02:52 +0000 Subject: [PATCH 04/10] Test case for fallback values + cleanup --- src/ghga_datasteward_kit/file_ingest.py | 2 +- src/ghga_datasteward_kit/s3_upload/config.py | 3 ++ .../s3_upload/entrypoint.py | 1 - .../s3_upload/uploader.py | 8 +---- tests/fixtures/ingest.py | 13 +++++-- tests/test_file_ingest.py | 36 +++++++++++++++++++ 6 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index 214c9ed..c3722e5 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -187,7 +187,7 @@ def file_ingest( if response.status_code == 403: raise ValueError("Not authorized to access ingest endpoint.") if response.status_code == 422: - raise ValueError(response.json()["detail"]) + raise ValueError("Could not decrypt received payload.") if response.status_code == 500: raise ValueError( "Internal file ingest service error or communication with vault failed." diff --git a/src/ghga_datasteward_kit/s3_upload/config.py b/src/ghga_datasteward_kit/s3_upload/config.py index c369c57..e559c3e 100644 --- a/src/ghga_datasteward_kit/s3_upload/config.py +++ b/src/ghga_datasteward_kit/s3_upload/config.py @@ -119,6 +119,9 @@ class LegacyConfig(S3ObjectStoragesConfig): default=5, description="Number of times a request should be retried on non critical errors.", ) + debug: bool = Field( + default=False, description="Enable debug functionality for upload." 
+ ) @field_validator("output_dir") def expand_env_vars_output_dir(cls, output_dir: Path): # noqa: N805 diff --git a/src/ghga_datasteward_kit/s3_upload/entrypoint.py b/src/ghga_datasteward_kit/s3_upload/entrypoint.py index aadabd1..a884444 100755 --- a/src/ghga_datasteward_kit/s3_upload/entrypoint.py +++ b/src/ghga_datasteward_kit/s3_upload/entrypoint.py @@ -70,7 +70,6 @@ async def validate_and_transfer_content( config=config, unencrypted_file_size=file_size, storage_cleaner=storage_cleaner, - debug_mode=config.debug, ) await uploader.encrypt_and_upload() diff --git a/src/ghga_datasteward_kit/s3_upload/uploader.py b/src/ghga_datasteward_kit/s3_upload/uploader.py index 4f6d32b..29d758d 100644 --- a/src/ghga_datasteward_kit/s3_upload/uploader.py +++ b/src/ghga_datasteward_kit/s3_upload/uploader.py @@ -38,10 +38,6 @@ httpx_client, ) -MAX_TIMEOUT_DEBUG = ( - 600 # maximum timeout for upload request used for debugging purposes -) - class UploadTaskHandler: """Wraps task scheduling details.""" @@ -63,14 +59,13 @@ async def gather(self): class ChunkedUploader: """Handler class dealing with upload functionality""" - def __init__( # noqa: PLR0913 + def __init__( self, input_path: Path, alias: str, config: LegacyConfig, unencrypted_file_size: int, storage_cleaner: StorageCleaner, - debug_mode: bool, ) -> None: self.alias = alias self.config = config @@ -80,7 +75,6 @@ def __init__( # noqa: PLR0913 self.unencrypted_file_size = unencrypted_file_size self.encrypted_file_size = 0 self._storage_cleaner = storage_cleaner - self.debug_mode = debug_mode async def encrypt_and_upload(self): """Delegate encryption and perform multipart upload""" diff --git a/tests/fixtures/ingest.py b/tests/fixtures/ingest.py index 1f95971..bcee07b 100644 --- a/tests/fixtures/ingest.py +++ b/tests/fixtures/ingest.py @@ -49,6 +49,9 @@ ), ) +SELECTED_STORAGE_ALIAS = "test" +SELECTED_BUCKET_ID = "selected-test-bucket" + @dataclass class IngestFixture: @@ -70,11 +73,12 @@ def legacy_ingest_fixture() -> Generator[IngestFixture, None, None]: keypair = generate_key_pair() file_path = Path(input_dir) / "test.json" - file_id = "happy_little_object" + file_id = "happy-little-object" metadata = LegacyOutputMetadata( alias="test_alias", file_id=file_id, + bucket_id="test-bucket", object_id=file_id, original_path=file_path, part_size=16 * 1024**2, @@ -95,7 +99,8 @@ def legacy_ingest_fixture() -> Generator[IngestFixture, None, None]: input_dir=Path(input_dir), map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), - selected_storage_alias="test", + selected_storage_alias=SELECTED_STORAGE_ALIAS, + selected_bucket_id=SELECTED_BUCKET_ID, ) submission_store = SubmissionStore(config=config) @@ -124,6 +129,7 @@ def ingest_fixture() -> Generator[IngestFixture, None, None]: metadata = OutputMetadata( alias="test_alias", file_id=file_id, + bucket_id="test-bucket", object_id=file_id, original_path=file_path, part_size=16 * 1024**2, @@ -144,7 +150,8 @@ def ingest_fixture() -> Generator[IngestFixture, None, None]: input_dir=Path(input_dir), map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), - selected_storage_alias="test", + selected_storage_alias=SELECTED_STORAGE_ALIAS, + selected_bucket_id=SELECTED_BUCKET_ID, ) submission_store = SubmissionStore(config=config) diff --git a/tests/test_file_ingest.py b/tests/test_file_ingest.py index b17129b..4003313 100644 --- a/tests/test_file_ingest.py +++ b/tests/test_file_ingest.py @@ -15,6 +15,8 @@ """File ingest tests.""" +import json + import pytest 
import yaml from ghga_service_commons.utils.simple_token import generate_token @@ -40,6 +42,7 @@ async def test_alias_to_accession(legacy_ingest_fixture: IngestFixture): # noqa metadata = models.LegacyOutputMetadata.load( input_path=legacy_ingest_fixture.file_path, selected_alias=legacy_ingest_fixture.config.selected_storage_alias, + selected_bucket=legacy_ingest_fixture.config.selected_bucket_id, ) accession = alias_to_accession( @@ -202,3 +205,36 @@ async def test_legacy_main( out, _ = capfd.readouterr() assert "Encountered 1 errors during processing" in out + + +def test_fallbacks( + legacy_ingest_fixture: IngestFixture, # noqa: F811 + ingest_fixture: IngestFixture, # noqa: F811 + tmp_path, +): + """Simulate loading old metadata files and test for newly populated fields""" + bucket_id = ingest_fixture.config.selected_bucket_id + storage_alias = ingest_fixture.config.selected_storage_alias + + for fixture, metadata_model in zip( + (legacy_ingest_fixture, ingest_fixture), + (models.LegacyOutputMetadata, models.OutputMetadata), + strict=True, + ): + with fixture.file_path.open("r") as source: + data = json.load(source) + + del data["Bucket ID"] + del data["Storage alias"] + + modified_metadata_path = tmp_path / "old_metadata.txt" + with modified_metadata_path.open("w") as target: + json.dump(data, target) + + metadata = metadata_model.load( + input_path=modified_metadata_path, + selected_alias=storage_alias, + selected_bucket=bucket_id, + ) + assert metadata.bucket_id == bucket_id + assert metadata.storage_alias == storage_alias From fd40c2e0833e2b59eea1b1b293e744933efb822f Mon Sep 17 00:00:00 2001 From: "Thomas J. Zajac" Date: Mon, 30 Sep 2024 13:17:58 +0000 Subject: [PATCH 05/10] Make CI pass --- tests/test_file_ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_file_ingest.py b/tests/test_file_ingest.py index 4003313..f8619c1 100644 --- a/tests/test_file_ingest.py +++ b/tests/test_file_ingest.py @@ -218,7 +218,7 @@ def test_fallbacks( for fixture, metadata_model in zip( (legacy_ingest_fixture, ingest_fixture), - (models.LegacyOutputMetadata, models.OutputMetadata), + (models.LegacyOutputMetadata, models.OutputMetadata), # type: ignore[arg-type] strict=True, ): with fixture.file_path.open("r") as source: @@ -231,7 +231,7 @@ def test_fallbacks( with modified_metadata_path.open("w") as target: json.dump(data, target) - metadata = metadata_model.load( + metadata = metadata_model.load( # type: ignore[attr-defined] input_path=modified_metadata_path, selected_alias=storage_alias, selected_bucket=bucket_id, From 33828b14b99625a802db13d43e181c2a85e82ae6 Mon Sep 17 00:00:00 2001 From: Thomas Zajac Date: Tue, 1 Oct 2024 14:25:22 +0200 Subject: [PATCH 06/10] Update README.md Co-authored-by: Byron Himes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b04d4e5..3c5c98c 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ This is achieved using the data steward kit, using the following steps: `ghga-datasteward-kit files upload` for uploading a single file or the `ghga-datasteward-kit files batch-upload` for uploading multiple files at once. There also exist legacy versions of these subcommands for compatibility reasons, - where the commonad is prefixed with `legacy-`. + where the command is prefixed with `legacy-`. Please see [this section](#files-batch-upload) for further details. This will output one summary JSON per uploaded file. The encryption secret is automatically transferred to GHGA central. 
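Note on patches 01-04 above: the backwards-compatibility logic they add to models.py reduces to two keyed lookups with configured defaults. Below is a minimal standalone sketch of that behavior, not the kit's actual code, which wraps it in the OutputMetadata/LegacyOutputMetadata.load() classmethods (and whose selected_bucket parameter is renamed to fallback_bucket by the end of this series). The helper name, file path, and example values are illustrative only.

import json
import logging
from pathlib import Path

LOG = logging.getLogger(__name__)


def load_location_fields(
    input_path: Path, selected_alias: str, selected_bucket: str
) -> tuple[str, str]:
    """Return (storage_alias, bucket_id) from a serialized output metadata
    file, substituting configured values when an older file predates them.
    """
    with input_path.open("r") as infile:
        data = json.load(infile)

    try:
        storage_alias = data["Storage alias"]  # written since v3.0.0
    except KeyError:
        LOG.warning(
            "Could not find storage alias in metadata, populating with"
            " configured alias '%s' instead.",
            selected_alias,
        )
        storage_alias = selected_alias

    try:
        bucket_id = data["Bucket ID"]  # written since v4.4.0
    except KeyError:
        LOG.warning(
            "Could not find bucket ID in metadata, populating with"
            " configured bucket '%s' instead.",
            selected_bucket,
        )
        bucket_id = selected_bucket

    return storage_alias, bucket_id


# For a pre-v3.0.0 output file both fallbacks kick in; the path and the
# configured values here are made up for illustration:
# alias, bucket = load_location_fields(
#     Path("old_upload.json"), selected_alias="hd-01", selected_bucket="staging"
# )

Handling the fallback at ingest time, rather than rewriting old output files, also means the serialized metadata files (which serialize() deliberately chmods to read-only 0o400) never have to be modified in place.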
From 52679b1260c21a5ae6928451034a7ddf817b7b79 Mon Sep 17 00:00:00 2001 From: Thomas Zajac Date: Tue, 1 Oct 2024 14:25:30 +0200 Subject: [PATCH 07/10] Update src/ghga_datasteward_kit/models.py Co-authored-by: Byron Himes --- src/ghga_datasteward_kit/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ghga_datasteward_kit/models.py b/src/ghga_datasteward_kit/models.py index 1804625..6d9b5e6 100644 --- a/src/ghga_datasteward_kit/models.py +++ b/src/ghga_datasteward_kit/models.py @@ -148,7 +148,7 @@ def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): bucket_id = data["Bucket ID"] except KeyError: LOG.warning( - "Could not find bucket ID in metadata, populating with configured alias '%s' instead.", + "Could not find bucket ID in metadata, populating with configured bucket '%s' instead.", selected_bucket, ) bucket_id = selected_bucket From e5a700ca166254f51b13021927b22017e06cc8d3 Mon Sep 17 00:00:00 2001 From: Thomas Zajac Date: Tue, 1 Oct 2024 14:26:14 +0200 Subject: [PATCH 08/10] Update src/ghga_datasteward_kit/models.py Co-authored-by: Byron Himes --- src/ghga_datasteward_kit/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ghga_datasteward_kit/models.py b/src/ghga_datasteward_kit/models.py index 6d9b5e6..ce7a24c 100644 --- a/src/ghga_datasteward_kit/models.py +++ b/src/ghga_datasteward_kit/models.py @@ -244,7 +244,7 @@ def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): bucket_id = data["Bucket ID"] except KeyError: LOG.warning( - "Could not find bucket ID in metadata, populating with configured alias '%s' instead.", + "Could not find bucket ID in metadata, populating with configured bucket '%s' instead.", selected_bucket, ) bucket_id = selected_bucket From 94b0708cc19ebde641855a2cefb3b9aaf3ac6a1b Mon Sep 17 00:00:00 2001 From: "Thomas J. Zajac" Date: Tue, 1 Oct 2024 12:37:57 +0000 Subject: [PATCH 09/10] Renamed to fallback_bucket_id --- src/ghga_datasteward_kit/file_ingest.py | 6 +++--- tests/fixtures/ingest.py | 4 ++-- tests/test_file_ingest.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index c3722e5..8e54bc2 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -80,7 +80,7 @@ class IngestConfig(SubmissionStoreConfig): + " During the later ingest phase, the alias will be validated by the File Ingest Service." 
), ) - selected_bucket_id: str = Field( + fallback_bucket_id: str = Field( default=..., description="Fallback bucket_id for older output metadata files that don't contain a bucket ID.", ) @@ -146,7 +146,7 @@ def file_ingest( output_metadata = models.OutputMetadata.load( input_path=in_path, selected_alias=config.selected_storage_alias, - selected_bucket=config.selected_bucket_id, + selected_bucket=config.fallback_bucket_id, ) endpoint = config.file_ingest_federated_endpoint LOG.info("Selected non-legacy endpoint %s for file %s.", endpoint, in_path) @@ -154,7 +154,7 @@ def file_ingest( output_metadata = models.LegacyOutputMetadata.load( input_path=in_path, selected_alias=config.selected_storage_alias, - selected_bucket=config.selected_bucket_id, + selected_bucket=config.fallback_bucket_id, ) endpoint = config.file_ingest_legacy_endpoint LOG.info("Selected legacy endpoint %s for file %s.", endpoint, in_path) diff --git a/tests/fixtures/ingest.py b/tests/fixtures/ingest.py index bcee07b..7dab841 100644 --- a/tests/fixtures/ingest.py +++ b/tests/fixtures/ingest.py @@ -100,7 +100,7 @@ def legacy_ingest_fixture() -> Generator[IngestFixture, None, None]: map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), selected_storage_alias=SELECTED_STORAGE_ALIAS, - selected_bucket_id=SELECTED_BUCKET_ID, + fallback_bucket_id=SELECTED_BUCKET_ID, ) submission_store = SubmissionStore(config=config) @@ -151,7 +151,7 @@ def ingest_fixture() -> Generator[IngestFixture, None, None]: map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), selected_storage_alias=SELECTED_STORAGE_ALIAS, - selected_bucket_id=SELECTED_BUCKET_ID, + fallback_bucket_id=SELECTED_BUCKET_ID, ) submission_store = SubmissionStore(config=config) diff --git a/tests/test_file_ingest.py b/tests/test_file_ingest.py index f8619c1..f488a97 100644 --- a/tests/test_file_ingest.py +++ b/tests/test_file_ingest.py @@ -42,7 +42,7 @@ async def test_alias_to_accession(legacy_ingest_fixture: IngestFixture): # noqa metadata = models.LegacyOutputMetadata.load( input_path=legacy_ingest_fixture.file_path, selected_alias=legacy_ingest_fixture.config.selected_storage_alias, - selected_bucket=legacy_ingest_fixture.config.selected_bucket_id, + selected_bucket=legacy_ingest_fixture.config.fallback_bucket_id, ) accession = alias_to_accession( @@ -213,7 +213,7 @@ def test_fallbacks( tmp_path, ): """Simulate loading old metadata files and test for newly populated fields""" - bucket_id = ingest_fixture.config.selected_bucket_id + bucket_id = ingest_fixture.config.fallback_bucket_id storage_alias = ingest_fixture.config.selected_storage_alias for fixture, metadata_model in zip( From 2732f9f06c4dc7cc6339eb3687a10adca13c5c14 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Zajac" Date: Tue, 1 Oct 2024 14:39:09 +0000 Subject: [PATCH 10/10] Select -> Fallback --- src/ghga_datasteward_kit/file_ingest.py | 4 ++-- src/ghga_datasteward_kit/models.py | 12 ++++++------ tests/fixtures/ingest.py | 6 +++--- tests/test_file_ingest.py | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/ghga_datasteward_kit/file_ingest.py b/src/ghga_datasteward_kit/file_ingest.py index 8e54bc2..fd7cf58 100644 --- a/src/ghga_datasteward_kit/file_ingest.py +++ b/src/ghga_datasteward_kit/file_ingest.py @@ -146,7 +146,7 @@ def file_ingest( output_metadata = models.OutputMetadata.load( input_path=in_path, selected_alias=config.selected_storage_alias, - selected_bucket=config.fallback_bucket_id, + fallback_bucket=config.fallback_bucket_id, ) endpoint = config.file_ingest_federated_endpoint LOG.info("Selected non-legacy endpoint %s for file %s.", endpoint, in_path) @@ -154,7 +154,7 @@ def file_ingest( output_metadata = models.LegacyOutputMetadata.load( input_path=in_path, selected_alias=config.selected_storage_alias, - selected_bucket=config.fallback_bucket_id, + fallback_bucket=config.fallback_bucket_id, ) endpoint = config.file_ingest_legacy_endpoint LOG.info("Selected legacy endpoint %s for file %s.", endpoint, in_path) diff --git a/src/ghga_datasteward_kit/models.py b/src/ghga_datasteward_kit/models.py index ce7a24c..d00b204 100644 --- a/src/ghga_datasteward_kit/models.py +++ b/src/ghga_datasteward_kit/models.py @@ -129,7 +129,7 @@ def serialize(self, output_path: Path): os.chmod(path=output_path, mode=0o400) @classmethod - def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): + def load(cls, input_path: Path, selected_alias: str, fallback_bucket: str): """Load metadata from serialized file""" with input_path.open("r") as infile: data = json.load(infile) @@ -149,9 +149,9 @@ def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): except KeyError: LOG.warning( "Could not find bucket ID in metadata, populating with configured bucket '%s' instead.", - selected_bucket, + fallback_bucket, ) - bucket_id = selected_bucket + bucket_id = fallback_bucket file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 @@ -225,7 +225,7 @@ def serialize(self, output_path: Path): os.chmod(path=output_path, mode=0o400) @classmethod - def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): + def load(cls, input_path: Path, selected_alias: str, fallback_bucket: str): """Load metadata from serialized file""" with input_path.open("r") as infile: data = json.load(infile) @@ -245,9 +245,9 @@ def load(cls, input_path: Path, selected_alias: str, selected_bucket: str): except KeyError: LOG.warning( "Could not find bucket ID in metadata, populating with configured bucket '%s' instead.", - selected_bucket, + fallback_bucket, ) - bucket_id = selected_bucket + bucket_id = fallback_bucket file_id = data["File UUID"] part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2 diff --git a/tests/fixtures/ingest.py b/tests/fixtures/ingest.py index 7dab841..7524ec4 100644 --- a/tests/fixtures/ingest.py +++ b/tests/fixtures/ingest.py @@ -50,7 +50,7 @@ ) SELECTED_STORAGE_ALIAS = "test" -SELECTED_BUCKET_ID = "selected-test-bucket" +FALLBACK_BUCKET_ID = "fallback-test-bucket" @dataclass @@ -100,7 +100,7 @@ def legacy_ingest_fixture() -> Generator[IngestFixture, None, None]: map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), selected_storage_alias=SELECTED_STORAGE_ALIAS, - 
fallback_bucket_id=SELECTED_BUCKET_ID, + fallback_bucket_id=FALLBACK_BUCKET_ID, ) submission_store = SubmissionStore(config=config) @@ -151,7 +151,7 @@ def ingest_fixture() -> Generator[IngestFixture, None, None]: map_files_fields=["study_files"], submission_store_dir=Path(submission_store_dir), selected_storage_alias=SELECTED_STORAGE_ALIAS, - fallback_bucket_id=SELECTED_BUCKET_ID, + fallback_bucket_id=FALLBACK_BUCKET_ID, ) submission_store = SubmissionStore(config=config) diff --git a/tests/test_file_ingest.py b/tests/test_file_ingest.py index f488a97..f51ec05 100644 --- a/tests/test_file_ingest.py +++ b/tests/test_file_ingest.py @@ -42,7 +42,7 @@ async def test_alias_to_accession(legacy_ingest_fixture: IngestFixture): # noqa metadata = models.LegacyOutputMetadata.load( input_path=legacy_ingest_fixture.file_path, selected_alias=legacy_ingest_fixture.config.selected_storage_alias, - selected_bucket=legacy_ingest_fixture.config.fallback_bucket_id, + fallback_bucket=legacy_ingest_fixture.config.fallback_bucket_id, ) accession = alias_to_accession( @@ -234,7 +234,7 @@ def test_fallbacks( metadata = metadata_model.load( # type: ignore[attr-defined] input_path=modified_metadata_path, selected_alias=storage_alias, - selected_bucket=bucket_id, + fallback_bucket=bucket_id, ) assert metadata.bucket_id == bucket_id assert metadata.storage_alias == storage_alias
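Taken together, after the selected_* to fallback_* renames in patches 09 and 10, the ingest path chooses between the federated and legacy endpoints roughly as sketched below. This is a simplified Python sketch using the final field names; the HTTP request and its status-code handling (403/422/500, touched by patches 02 and 04) are left out, and select_endpoint is an illustrative wrapper, not a function in the kit.

from pathlib import Path

from pydantic import ValidationError

from ghga_datasteward_kit import models, utils
from ghga_datasteward_kit.file_ingest import IngestConfig


def select_endpoint(in_path: Path, config: IngestConfig):
    """Load one output metadata file, preferring the current format, and
    return the parsed metadata together with the matching ingest URL."""
    try:
        metadata = models.OutputMetadata.load(
            input_path=in_path,
            selected_alias=config.selected_storage_alias,
            fallback_bucket=config.fallback_bucket_id,
        )
        endpoint = config.file_ingest_federated_endpoint
    except (KeyError, ValidationError):
        # Output files written by the legacy-* upload commands carry the
        # raw secret under "Symmetric file encryption secret" instead of a
        # secret ID, so the load above raises and we fall back here.
        metadata = models.LegacyOutputMetadata.load(
            input_path=in_path,
            selected_alias=config.selected_storage_alias,
            fallback_bucket=config.fallback_bucket_id,
        )
        endpoint = config.file_ingest_legacy_endpoint
    return metadata, utils.path_join(config.file_ingest_baseurl, endpoint)

The format detection stays implicit: a current-format file parses as OutputMetadata, while a legacy file is missing the "Symmetric file encryption secret ID" key, so the first load() raises KeyError (or ValidationError) and control drops into the legacy branch, the same try/except introduced in patch 01.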