diff --git a/data/db-raw-data/summary.txt b/data/db-raw-data/summary.txt index 54624244..158a31a5 100644 --- a/data/db-raw-data/summary.txt +++ b/data/db-raw-data/summary.txt @@ -1,15 +1,5 @@ -db-raw-data/datasets.json db-raw-data/neurons.json -db-raw-data/annotations/complete.annotations.json -db-raw-data/annotations/head.annotations.json -db-raw-data/trajectories/witvliet_2020_7.json -db-raw-data/trajectories/witvliet_2020_4.json -db-raw-data/trajectories/witvliet_2020_6.json -db-raw-data/trajectories/witvliet_2020_8.json -db-raw-data/trajectories/witvliet_2020_1.json -db-raw-data/trajectories/witvliet_2020_3.json -db-raw-data/trajectories/witvliet_2020_5.json -db-raw-data/trajectories/witvliet_2020_2.json +db-raw-data/datasets.json db-raw-data/connections/white_1986_jse.json db-raw-data/connections/witvliet_2020_7.json db-raw-data/connections/witvliet_2020_4.json @@ -22,3 +12,5 @@ db-raw-data/connections/white_1986_whole.json db-raw-data/connections/witvliet_2020_3.json db-raw-data/connections/witvliet_2020_5.json db-raw-data/connections/witvliet_2020_2.json +db-raw-data/annotations/head.annotations.json +db-raw-data/annotations/complete.annotations.json \ No newline at end of file diff --git a/ingestion/ingestion/ingest.py b/ingestion/ingestion/ingest.py index 7da22e0b..fbf27eea 100644 --- a/ingestion/ingestion/ingest.py +++ b/ingestion/ingestion/ingest.py @@ -4,6 +4,7 @@ import logging import os import sys +import tempfile from argparse import ArgumentParser, Namespace from pathlib import Path from time import sleep @@ -166,16 +167,20 @@ def validate_and_upload_data( logger.info(f"Uploading raw data...") - paths: list[Path] = [data_files.neurons, data_files.datasets] - paths.extend(conn for conn in data_files.connections.values()) - paths.extend(ann for ann in data_files.annotations.values()) + paths: list[Path] = list(data_files.all_paths()) pbar = tqdm(paths, disable=rs.dry_run) for p in pbar: pbar.set_description(str(p)) - rs.upload(p, fs_data_blob_name(dataset_id, p, dir), overwrite=overwrite) + rs.upload(p, fs_data_blob_name(p, dir), overwrite=overwrite) - logger.info(f"Done uploading raw data!") + logger.info("Done uploading raw data!") + + logger.info("Building the summary.txt file...") + summary_file = dir / "summary.txt" + summary_file.write_text("\n".join(fs_data_blob_name(file, dir) for file in paths)) + rs.upload(summary_file, fs_data_blob_name(summary_file, dir), overwrite=overwrite) + logger.info("Done uploading summary.txt") def prune_bucket(bucket: storage.Bucket | FakeBucket): diff --git a/ingestion/ingestion/schema.py b/ingestion/ingestion/schema.py index 06dcc866..688e321f 100644 --- a/ingestion/ingestion/schema.py +++ b/ingestion/ingestion/schema.py @@ -2,7 +2,8 @@ from dataclasses import dataclass, field from enum import Enum, IntEnum -from typing import Generic, Literal, TypeVar +from itertools import chain +from typing import Generator, Generic, Literal, TypeVar from pydantic import BaseModel, Field, RootModel, model_validator @@ -122,3 +123,9 @@ class DataContainer(Generic[T]): datasets: T connections: dict[str, T] = field(default_factory=dict) # dataset_name: T annotations: dict[DataAnnotationEntry, T] = field(default_factory=dict) + + def all_paths(self) -> Generator[T]: + yield self.neurons + yield self.datasets + yield from self.connections.values() + yield from self.annotations.values() diff --git a/ingestion/ingestion/storage/blob.py b/ingestion/ingestion/storage/blob.py index de8a8c67..806e327c 100644 --- a/ingestion/ingestion/storage/blob.py +++ b/ingestion/ingestion/storage/blob.py @@ -7,7 +7,7 @@ from ingestion.storage.filesystem import SEGMENTATION_REGEX -def fs_data_blob_name(dataset_id: str, p: Path, base_dir: Path) -> str: +def fs_data_blob_name(p: Path, base_dir: Path) -> str: return f"db-raw-data/{p.relative_to(base_dir)}" diff --git a/ingestion/tests/storage/test_blob.py b/ingestion/tests/storage/test_blob.py index 629d215b..108fc443 100644 --- a/ingestion/tests/storage/test_blob.py +++ b/ingestion/tests/storage/test_blob.py @@ -45,7 +45,7 @@ def test__fs_data_blob_name( dataset_id: str, file_path: Path, base_dir: Path, blob_name: str ): - assert fs_data_blob_name(dataset_id, file_path, base_dir) == blob_name + assert fs_data_blob_name(file_path, base_dir) == blob_name @pytest.mark.parametrize(