Commit 409dd2c
Merge remote-tracking branch 'origin/main' into feature/model_generation
lkuchenb committed Jul 31, 2023
2 parents 3d38d7f + 2154b93 commit 409dd2c
Showing 11 changed files with 303 additions and 156 deletions.
124 changes: 64 additions & 60 deletions .devcontainer/devcontainer.json
@@ -2,76 +2,80 @@
 // https://github.com/microsoft/vscode-dev-containers/tree/v0.177.0/containers/python-3-postgres
 // Update the VARIANT arg in docker-compose.yml to pick a Python version: 3, 3.8, 3.7, 3.6
 {
-    "name": "Data Steward scripts",
+    "name": "Data Steward Kit",
     "dockerComposeFile": "docker-compose.yml",
     "service": "app",
     "workspaceFolder": "/workspace",
-    // Set *default* container specific settings.json values on container create.
-    "settings": {
-        "terminal.integrated.shell.linux": "/bin/bash",
-        "sqltools.connections": [
-            {
-                "name": "Container database",
-                "driver": "PostgreSQL",
-                "previewLimit": 50,
-                "server": "localhost",
-                "port": 5432,
-                "database": "postgres",
-                "username": "postgres",
-                "password": "postgres"
-            }
-        ],
-        "python.pythonPath": "/usr/local/bin/python",
-        "python.languageServer": "Pylance",
-        "python.linting.enabled": true,
-        "python.linting.pylintEnabled": true,
-        "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
-        "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
-        "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
-        "python.formatting.provider": "black",
-        "editor.formatOnSave": true,
-        "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
-        "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
-        "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
-        "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
-        "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint",
-        "python.testing.pytestPath": "/usr/local/py-utils/bin/pytest",
-        "python.testing.unittestEnabled": false,
-        "python.testing.pytestEnabled": true,
-        "editor.renderWhitespace": "all",
-        "editor.rulers": [
-            88
-        ],
-        "licenser.license": "AL2",
-        "licenser.author": "Universität Tübingen, DKFZ and EMBL\nfor the German Human Genome-Phenome Archive (GHGA)",
+    "customizations": {
+        "vscode": {
+            // Set *default* container specific settings.json values on container create.
+            "settings": {
+                "terminal.integrated.profiles.linux": {
+                    "bash": {
+                        "path": "/bin/bash"
+                    }
+                },
+                "python.pythonPath": "/usr/local/bin/python",
+                "python.languageServer": "Pylance",
+                "python.linting.enabled": true,
+                "python.linting.pylintEnabled": true,
+                "python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
+                "python.formatting.blackPath": "/usr/local/py-utils/bin/black",
+                "python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
+                "python.formatting.provider": "black",
+                "python.analysis.typeCheckingMode": "basic",
+                "python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
+                "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
+                "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
+                "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
+                "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint",
+                "python.testing.pytestPath": "/usr/local/py-utils/bin/pytest",
+                "python.testing.unittestEnabled": false,
+                "python.testing.pytestEnabled": true,
+                "editor.formatOnSave": true,
+                "editor.renderWhitespace": "all",
+                "editor.rulers": [
+                    88
+                ],
+                "licenser.license": "Custom",
+                "licenser.customHeaderFile": "/workspace/.devcontainer/license_header.txt"
+            },
+            // Add the IDs of extensions you want installed when the container is created.
+            "extensions": [
+                "mikestead.dotenv",
+                "ms-azuretools.vscode-docker",
+                "ms-python.black-formatter",
+                "ms-python.python",
+                "ms-python.isort",
+                "ms-python.vscode-pylance",
+                "ms-toolsai.jupyter",
+                "vtenentes.bdd",
+                "njpwerner.autodocstring",
+                "redhat.vscode-yaml",
+                "42crunch.vscode-openapi",
+                "arjun.swagger-viewer",
+                "eamodio.gitlens",
+                "github.vscode-pull-request-github",
+                "streetsidesoftware.code-spell-checker",
+                "yzhang.markdown-all-in-one",
+                "visualstudioexptteam.vscodeintellicode",
+                "ymotongpoo.licenser",
+                "editorconfig.editorconfig"
+            ]
+        }
     },
-    // Add the IDs of extensions you want installed when the container is created.
-    "extensions": [
-        "ms-python.python",
-        "ms-python.vscode-pylance",
-        "mtxr.sqltools",
-        "mtxr.sqltools-driver-pg",
-        "42crunch.vscode-openapi",
-        "eamodio.gitlens",
-        "formulahendry.terminal",
-        "tyriar.terminal-tabs",
-        "alexcvzz.vscode-sqlite",
-        "njpwerner.autodocstring",
-        "arjun.swagger-viewer",
-        "ms-toolsai.jupyter",
-        "redhat.vscode-yaml",
-        "ymotongpoo.licenser",
-        "ms-azuretools.vscode-docker",
-        "EditorConfig.EditorConfig"
-    ],
     // Use 'forwardPorts' to make a list of ports inside the container available locally.
     // "forwardPorts": [5000, 5432],
     // Use 'postCreateCommand' to run commands after the container is created.
     "postCreateCommand": "dev_install",
     // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
     "remoteUser": "vscode",
     "features": {
-        "docker-in-docker": {
+        "ghcr.io/devcontainers/features/docker-in-docker:2": {
             "version": "latest",
-            "moby": true
+            "enableNonRootDocker": "true",
+            "moby": true,
+            "azureDnsAutoDetection": false
         }
     }
 }
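The relocated settings follow the newer Dev Containers schema, which nests editor-specific configuration under customizations.vscode and references features by their OCI identifiers. A condensed sketch of the resulting top-level layout, with comments marking elided detail:

    {
        "name": "Data Steward Kit",
        "dockerComposeFile": "docker-compose.yml",
        "service": "app",
        "workspaceFolder": "/workspace",
        "customizations": {
            "vscode": {
                "settings": { /* linting, formatting, testing, licenser settings */ },
                "extensions": [ /* extension IDs to preinstall */ ]
            }
        },
        "postCreateCommand": "dev_install",
        "remoteUser": "vscode",
        "features": {
            "ghcr.io/devcontainers/features/docker-in-docker:2": {
                "version": "latest",
                "enableNonRootDocker": "true",
                "moby": true,
                "azureDnsAutoDetection": false
            }
        }
    }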
6 changes: 3 additions & 3 deletions .github/workflows/pypi_publish.yaml
@@ -61,12 +61,12 @@ jobs:
         pytest .
     - name: Publish distribution package to PyPI (test)
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
-        repository_url: https://test.pypi.org/legacy/
+        repository-url: https://test.pypi.org/legacy/

     - name: Publish distribution package to PyPI (production)
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         password: ${{ secrets.PYPI_API_TOKEN }}
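Pinning to release/v1 tracks maintained releases of the publish action instead of its moving development branch, and repository-url is the hyphenated input name that newer versions of the action expect in place of the older underscore spelling. The corrected test-PyPI step in full, assuming the surrounding job has already built the distribution:

    - name: Publish distribution package to PyPI (test)
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.TEST_PYPI_API_TOKEN }}
        repository-url: https://test.pypi.org/legacy/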
10 changes: 1 addition & 9 deletions ghga_datasteward_kit/cli/file.py
@@ -66,15 +66,7 @@ def ingest_upload_metadata(
 ):
     """Upload all output metadata files from the given directory to the file ingest service"""

-    def dummy_generator():
-        """Placeholder, needs replacement with actual implementation"""
-        while True:
-            yield "test_id"
-
-    errors = file_ingest.main(
-        config_path=config_path,
-        id_generator=dummy_generator,
-    )
+    errors = file_ingest.main(config_path=config_path)

     if errors:
         print(f"Encountered {len(errors)} errors during processing.")
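With the placeholder ID generator gone, the command needs only a config file; file accessions are now resolved from the submission store (see file_ingest.py below). A sketch of what such a config could look like: the field names mirror IngestConfig further down, while submission_store_dir is an assumed name for the path field inherited from metldata's SubmissionStoreConfig, and all values are illustrative.

    # Illustrative ingest config; URL and paths are placeholders.
    file_ingest_url: https://ffis.example.org/ingest  # hypothetical endpoint
    file_ingest_pubkey: "<server Crypt4GH public key>"
    input_dir: /data/upload_output
    map_files_fields:
      - study_files
    submission_store_dir: /data/submission_store  # assumed field from SubmissionStoreConfig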
73 changes: 52 additions & 21 deletions ghga_datasteward_kit/file_ingest.py
@@ -14,17 +14,20 @@
 # limitations under the License.
 """Interaction with file ingest service"""

-from itertools import islice
 from pathlib import Path
 from typing import Callable

 import httpx
-from pydantic import BaseSettings, Field, ValidationError
+from metldata.submission_registry.submission_store import (
+    SubmissionStore,
+    SubmissionStoreConfig,
+)
+from pydantic import Field, ValidationError

 from ghga_datasteward_kit import models, utils


-class IngestConfig(BaseSettings):
+class IngestConfig(SubmissionStoreConfig):
     """Config options for calling the file ingest endpoint"""

     file_ingest_url: str = Field(
@@ -38,11 +41,41 @@ class IngestConfig(BaseSettings):
         description="Path to directory containing output files from the "
         + "upload/batch_upload command.",
     )
+    map_files_fields: list[str] = Field(
+        ["study_files"],
+        description="Names of the accession map fields for looking up the"
+        + " alias->accession mapping.",
+    )


+def alias_to_accession(
+    alias: str, map_fields: list[str], submission_store: SubmissionStore
+) -> str:
+    """Get all submissions to retrieve valid accessions from corresponding file aliases"""
+
+    submission_ids = submission_store.get_all_submission_ids()
+
+    all_submission_map = {}
+
+    for submission_id in submission_ids:
+        submission = submission_store.get_by_id(submission_id=submission_id)
+        for field in map_fields:
+            if field not in submission.accession_map:
+                raise ValueError(
+                    f"Configured field {field} not found in accession map."
+                )
+            all_submission_map.update(submission.accession_map[field])
+
+    accession = all_submission_map.get(alias)
+
+    if accession is None:
+        raise ValueError(f"No accession exists for file alias {alias}")
+
+    return accession
+
+
 def main(
     config_path: Path,
-    id_generator: Callable[[], str],
 ):
     """Handle ingestion of a folder of s3 upload file metadata"""

@@ -51,37 +84,35 @@ def main(

     errors = {}

-    # pre generate paths/ids to make sure generator produces a sufficient amount of ids
-    file_paths = [
-        file_path
-        for file_path in config.input_dir.iterdir()
-        if file_path.suffix == ".json"
-    ]
-    file_ids = list(islice(id_generator(), len(file_paths)))
-
-    if len(file_paths) != len(file_ids):
-        raise ValueError(
-            "Provided ID generator function does not create the correct amount of IDs."
-            + f"\nRequired: {len(file_paths)}, generated {len(file_ids)}"
-        )
-
-    for in_path, file_id in zip(file_paths, file_ids):
+    for in_path in config.input_dir.iterdir():
+        if in_path.suffix != ".json":
+            continue
         try:
-            file_ingest(in_path=in_path, file_id=file_id, token=token, config=config)
+            file_ingest(in_path=in_path, token=token, config=config)
         except (ValidationError, ValueError) as error:
             errors[in_path.resolve()] = str(error)
             continue

     return errors


-def file_ingest(in_path: Path, file_id: str, token: str, config: IngestConfig):
+def file_ingest(
+    in_path: Path,
+    token: str,
+    config: IngestConfig,
+    alias_to_id: Callable[[str, list[str], SubmissionStore], str] = alias_to_accession,
+):
     """
     Transform from s3 upload output representation to what the file ingest service expects.
     Then call the ingest endpoint
     """

+    submission_store = SubmissionStore(config=config)
+
     output_metadata = models.OutputMetadata.load(input_path=in_path)
+    file_id = alias_to_id(
+        output_metadata.alias, config.map_files_fields, submission_store
+    )
     upload_metadata = output_metadata.to_upload_metadata(file_id=file_id)
     encrypted = upload_metadata.encrypt_metadata(pubkey=config.file_ingest_pubkey)

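Taken together, ingestion now resolves each file alias against the accession maps of all stored submissions before contacting the ingest service. A minimal usage sketch, under the same assumption about the inherited submission_store_dir field name; all values are illustrative:

    from pathlib import Path

    from metldata.submission_registry.submission_store import SubmissionStore

    from ghga_datasteward_kit.file_ingest import IngestConfig, alias_to_accession

    config = IngestConfig(
        file_ingest_url="https://ffis.example.org/ingest",  # hypothetical endpoint
        file_ingest_pubkey="<server Crypt4GH public key>",
        input_dir=Path("/data/upload_output"),
        map_files_fields=["study_files"],
        submission_store_dir=Path("/data/submission_store"),  # assumed inherited field
    )

    # Resolve a single alias to its accession across all stored submissions.
    accession = alias_to_accession(
        alias="some_file_alias",
        map_fields=config.map_files_fields,
        submission_store=SubmissionStore(config=config),
    )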
(Diffs for the remaining 7 changed files were not loaded.)
