Structural changes to S3 upload + legacy/non-legacy-split wip
mephenor committed Oct 24, 2023
1 parent 114a297 commit fd562e3
Showing 13 changed files with 1,056 additions and 634 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
@@ -21,7 +21,7 @@ RUN if [ "${INSTALL_NODE}" = "true" ]; then su vscode -c "umask 0002 && . /usr/l
 RUN apt update \
     && export DEBIAN_FRONTEND=noninteractive \
     && apt -y install --no-install-recommends postgresql-client-common \
-    && apt-get install --fix-missing libcurl4-openssl-dev libssl-dev
+    && apt -y install --fix-missing libcurl4-openssl-dev libssl-dev
 
 # Copy install and launcher script to bin:
 COPY ./dev_install /bin
53 changes: 53 additions & 0 deletions ghga_datasteward_kit/cli/file.py
@@ -23,6 +23,17 @@
 cli = typer.Typer()
 
 
+@cli.command()
+def legacy_upload(
+    input_path: Path = typer.Option(..., help="Local path of the input file"),
+    alias: str = typer.Option(..., help="A human readable file alias"),
+    config_path: Path = typer.Option(..., help="Path to a config YAML."),
+):
+    """Upload a single file to S3 using the legacy workflow."""
+
+    s3_upload.legacy_main(input_path=input_path, alias=alias, config_path=config_path)
+
+
 @cli.command()
 def upload(
     input_path: Path = typer.Option(..., help="Local path of the input file"),
@@ -34,6 +45,32 @@ def upload(
     s3_upload.main(input_path=input_path, alias=alias, config_path=config_path)
 
 
+@cli.command()
+def legacy_batch_upload(
+    tsv: Path = typer.Option(
+        ...,
+        help=(
+            "Path to a tsv file with the first column containing the file path and the"
+            + " second column containing the file alias."
+        ),
+    ),
+    config_path: Path = typer.Option(..., help="Path to a config YAML."),
+    parallel_processes: int = typer.Option(..., help="Number of parallel uploads."),
+    dry_run: bool = typer.Option(
+        False,
+        help=("Only print commands for each file." + " No uploads are performed."),
+    ),
+):
+    """Upload multiple files to S3 using the legacy workflow."""
+
+    batch_s3_upload.main(
+        file_overview_tsv=tsv,
+        config_path=config_path,
+        parallel_processes=parallel_processes,
+        dry_run=dry_run,
+    )
+
+
 @cli.command()
 def batch_upload(
     tsv: Path = typer.Option(
@@ -60,6 +97,22 @@
     )
 
 
+@cli.command()
+def legacy_ingest_upload_metadata(
+    config_path: Path = typer.Option(..., help="Path to a config YAML."),
+):
+    """Upload all output metadata files from the given directory to the file ingest service."""
+
+    errors = file_ingest.main(config_path=config_path)
+
+    if errors:
+        print(f"Encountered {len(errors)} errors during processing.")
+        for file_path, cause in errors.items():
+            print(f"  - {file_path}: {cause}")
+    else:
+        print("Successfully sent all file upload metadata for ingest.")
+
+
 @cli.command()
 def ingest_upload_metadata(
     config_path: Path = typer.Option(..., help="Path to a config YAML."),
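Note: with this commit, each single-file command gains a legacy twin. The legacy path emits output metadata that carries the file secret itself, while the non-legacy path references the encryption secret by ID (see the models.py changes below). A minimal sketch of driving the two entry points from Python instead of the CLI, assuming s3_upload is importable from the package root as in the calls above, with purely hypothetical paths and alias:

from pathlib import Path

from ghga_datasteward_kit import s3_upload  # assumed import location

# Legacy workflow: the output metadata embeds the file secret.
s3_upload.legacy_main(
    input_path=Path("data/example.fastq.gz"),  # hypothetical input file
    alias="example-file",  # hypothetical alias
    config_path=Path("config.yaml"),  # hypothetical config
)

# New workflow: the output metadata references the secret by ID.
s3_upload.main(
    input_path=Path("data/example.fastq.gz"),
    alias="example-file",
    config_path=Path("config.yaml"),
)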
108 changes: 99 additions & 9 deletions ghga_datasteward_kit/models.py
@@ -60,20 +60,28 @@ def update_encrypted(self, part: bytes):
 
 
 @dataclass
-class OutputMetadata:  # pylint: disable=too-many-instance-attributes
+class OutputMetadataBase:  # pylint: disable=too-many-instance-attributes
     """Container class for output metadata"""
 
     alias: str
     file_uuid: str
     original_path: Path
     part_size: int
-    file_secret: bytes
     unencrypted_checksum: str
     encrypted_md5_checksums: list[str]
     encrypted_sha256_checksums: list[str]
     unencrypted_size: int
     encrypted_size: int
 
+
+@dataclass
+class LegacyOutputMetadata(
+    OutputMetadataBase
+):  # pylint: disable=too-many-instance-attributes
+    """Container class for legacy output metadata"""
+
+    file_secret: bytes
+
     def serialize(self, output_path: Path):
         """Serialize metadata to file"""
 
@@ -104,7 +112,7 @@ def serialize(self, output_path: Path):
     def to_upload_metadata(self, file_id: str):
         """Convert internal output file representation to unencrypted request model"""
 
-        return FileUploadMetadata(
+        return LegacyFileUploadMetadata(
            file_id=file_id,
            object_id=self.file_uuid,
            part_size=self.part_size,
@@ -125,7 +133,7 @@ def load(cls, input_path: Path):
 
         part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2
 
-        return OutputMetadata(
+        return LegacyOutputMetadata(
            alias=data["Alias"],
            file_uuid=data["File UUID"],
            original_path=Path(data["Original filesystem path"]),
@@ -139,7 +147,78 @@ def load(cls, input_path: Path):
         )
 
 
-class FileUploadMetadata(BaseModel):
+@dataclass
+class OutputMetadata(
+    OutputMetadataBase
+):  # pylint: disable=too-many-instance-attributes
+    """Container class for output metadata"""
+
+    secret_id: str
+
+    def serialize(self, output_path: Path):
+        """Serialize metadata to file"""
+
+        output: dict[str, Any] = {}
+        output["Alias"] = self.alias
+        output["File UUID"] = self.file_uuid
+        output["Original filesystem path"] = str(self.original_path.resolve())
+        output["Part Size"] = f"{self.part_size // 1024**2} MiB"
+        output["Unencrypted file size"] = self.unencrypted_size
+        output["Encrypted file size"] = self.encrypted_size
+        output["Symmetric file encryption secret ID"] = self.secret_id
+        output["Unencrypted file checksum"] = self.unencrypted_checksum
+        output["Encrypted file part checksums (MD5)"] = self.encrypted_md5_checksums
+        output[
+            "Encrypted file part checksums (SHA256)"
+        ] = self.encrypted_sha256_checksums
+
+        if not output_path.parent.exists():
+            output_path.parent.mkdir(parents=True)
+
+        # owner read-only
+        with output_path.open("w") as file:
+            json.dump(output, file, indent=2)
+        os.chmod(path=output_path, mode=0o400)
+
+    def to_upload_metadata(self, file_id: str):
+        """Convert internal output file representation to unencrypted request model"""
+
+        return FileUploadMetadata(
+            file_id=file_id,
+            object_id=self.file_uuid,
+            part_size=self.part_size,
+            unencrypted_size=self.unencrypted_size,
+            encrypted_size=self.encrypted_size,
+            secret_id=self.secret_id,
+            unencrypted_checksum=self.unencrypted_checksum,
+            encrypted_md5_checksums=self.encrypted_md5_checksums,
+            encrypted_sha256_checksums=self.encrypted_sha256_checksums,
+        )
+
+    @classmethod
+    def load(cls, input_path: Path):
+        """Load metadata from serialized file"""
+
+        with input_path.open("r") as infile:
+            data = json.load(infile)
+
+        part_size = int(data["Part Size"].rpartition(" MiB")[0]) * 1024**2
+
+        return OutputMetadata(
+            alias=data["Alias"],
+            file_uuid=data["File UUID"],
+            original_path=Path(data["Original filesystem path"]),
+            part_size=part_size,
+            secret_id=data["Symmetric file encryption secret ID"],
+            unencrypted_checksum=data["Unencrypted file checksum"],
+            encrypted_md5_checksums=data["Encrypted file part checksums (MD5)"],
+            encrypted_sha256_checksums=data["Encrypted file part checksums (SHA256)"],
+            unencrypted_size=int(data["Unencrypted file size"]),
+            encrypted_size=int(data["Encrypted file size"]),
+        )
+
+
+class FileUploadMetadataBase(BaseModel):
     """Decrypted payload model for S3 upload script output"""
 
     # get all data for now, optimize later if we don't need all of it
@@ -148,7 +227,6 @@ class FileUploadMetadata(BaseModel):
     part_size: int
     unencrypted_size: int
     encrypted_size: int
-    file_secret: str
     unencrypted_checksum: str
     encrypted_md5_checksums: list[str]
     encrypted_sha256_checksums: list[str]
@@ -159,10 +237,22 @@ def encrypt_metadata(self, pubkey: str):
         payload = self.json()
         encrypted = encrypt(data=payload, key=pubkey)
 
-        return FileUploadMetadataEncrypted(payload=encrypted)
+        return EncryptedPayload(payload=encrypted)
 
 
+class LegacyFileUploadMetadata(FileUploadMetadataBase):
+    """Decrypted payload model for legacy S3 upload script output"""
+
+    file_secret: str
+
+
+class FileUploadMetadata(FileUploadMetadataBase):
+    """Decrypted payload model for S3 upload script output"""
+
+    secret_id: str
+
+
-class FileUploadMetadataEncrypted(BaseModel):
-    """Encrypted file upload metadata model"""
+class EncryptedPayload(BaseModel):
+    """Contains encrypted upload metadata or secret as payload"""
 
     payload: str
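Taken together, the non-legacy flow can be exercised roughly as follows. This is a sketch only, assuming the models above are importable from ghga_datasteward_kit.models; every field value, the file ID, and the public key are made-up placeholders, and the key format expected by the encrypt() helper is not shown in this diff:

from pathlib import Path

from ghga_datasteward_kit.models import OutputMetadata  # assumed import path

metadata = OutputMetadata(
    alias="example-file",
    file_uuid="00000000-0000-0000-0000-000000000000",  # hypothetical UUID
    original_path=Path("data/example.fastq.gz"),
    part_size=16 * 1024**2,  # whole MiB, so the "<n> MiB" field round-trips exactly
    unencrypted_checksum="<sha256 hex digest>",
    encrypted_md5_checksums=["<md5 of part 1>"],
    encrypted_sha256_checksums=["<sha256 of part 1>"],
    unencrypted_size=1_000_000,
    encrypted_size=1_000_128,
    secret_id="example-secret-id",  # hypothetical secret ID
)

# Round-trip through the serialized form; serialize() chmods the output to 0o400.
out_path = Path("output/example-file.json")
metadata.serialize(out_path)
restored = OutputMetadata.load(out_path)
assert restored.secret_id == metadata.secret_id

# Convert to the request model and encrypt it for handover as an EncryptedPayload.
upload_metadata = restored.to_upload_metadata(file_id="FILE0001")  # hypothetical ID
encrypted = upload_metadata.encrypt_metadata(pubkey="<recipient public key>")
print(type(encrypted).__name__)  # EncryptedPayload; its payload field holds the ciphertext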
