Skip to content

Commit

Permalink
CELE-99 Add summary.txt generation
Browse files Browse the repository at this point in the history
  • Loading branch information
aranega committed Sep 26, 2024
1 parent 4bd83d5 commit 44265f9
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 19 deletions.
14 changes: 3 additions & 11 deletions data/db-raw-data/summary.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
db-raw-data/datasets.json
db-raw-data/neurons.json
db-raw-data/annotations/complete.annotations.json
db-raw-data/annotations/head.annotations.json
db-raw-data/trajectories/witvliet_2020_7.json
db-raw-data/trajectories/witvliet_2020_4.json
db-raw-data/trajectories/witvliet_2020_6.json
db-raw-data/trajectories/witvliet_2020_8.json
db-raw-data/trajectories/witvliet_2020_1.json
db-raw-data/trajectories/witvliet_2020_3.json
db-raw-data/trajectories/witvliet_2020_5.json
db-raw-data/trajectories/witvliet_2020_2.json
db-raw-data/datasets.json
db-raw-data/connections/white_1986_jse.json
db-raw-data/connections/witvliet_2020_7.json
db-raw-data/connections/witvliet_2020_4.json
Expand All @@ -22,3 +12,5 @@ db-raw-data/connections/white_1986_whole.json
db-raw-data/connections/witvliet_2020_3.json
db-raw-data/connections/witvliet_2020_5.json
db-raw-data/connections/witvliet_2020_2.json
db-raw-data/annotations/head.annotations.json
db-raw-data/annotations/complete.annotations.json
15 changes: 10 additions & 5 deletions ingestion/ingestion/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import os
import sys
import tempfile
from argparse import ArgumentParser, Namespace
from pathlib import Path
from time import sleep
Expand Down Expand Up @@ -166,16 +167,20 @@ def validate_and_upload_data(

logger.info(f"Uploading raw data...")

paths: list[Path] = [data_files.neurons, data_files.datasets]
paths.extend(conn for conn in data_files.connections.values())
paths.extend(ann for ann in data_files.annotations.values())
paths: list[Path] = list(data_files.all_paths())

pbar = tqdm(paths, disable=rs.dry_run)
for p in pbar:
pbar.set_description(str(p))
rs.upload(p, fs_data_blob_name(dataset_id, p, dir), overwrite=overwrite)
rs.upload(p, fs_data_blob_name(p, dir), overwrite=overwrite)

logger.info(f"Done uploading raw data!")
logger.info("Done uploading raw data!")

logger.info("Building the summary.txt file...")
summary_file = dir / "summary.txt"
summary_file.write_text("\n".join(fs_data_blob_name(file, dir) for file in paths))
rs.upload(summary_file, fs_data_blob_name(summary_file, dir), overwrite=overwrite)
logger.info("Done uploading summary.txt")


def prune_bucket(bucket: storage.Bucket | FakeBucket):
Expand Down
9 changes: 8 additions & 1 deletion ingestion/ingestion/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from dataclasses import dataclass, field
from enum import Enum, IntEnum
from typing import Generic, Literal, TypeVar
from itertools import chain
from typing import Generator, Generic, Literal, TypeVar

from pydantic import BaseModel, Field, RootModel, model_validator

Expand Down Expand Up @@ -122,3 +123,9 @@ class DataContainer(Generic[T]):
datasets: T
connections: dict[str, T] = field(default_factory=dict) # dataset_name: T
annotations: dict[DataAnnotationEntry, T] = field(default_factory=dict)

def all_paths(self) -> Generator[T, None, None]:
    """Yield every entry held by this container.

    The order is fixed: ``neurons`` first, then ``datasets``, then the
    values of ``connections`` followed by the values of ``annotations``
    (each in dict insertion order).
    """
    # Generator is spelled with all three parameters because
    # ``typing.Generator`` only accepts a single argument from Python 3.13
    # onwards; on older interpreters ``Generator[T]`` raises TypeError when
    # the annotation is evaluated.
    yield self.neurons
    yield self.datasets
    yield from self.connections.values()
    yield from self.annotations.values()
2 changes: 1 addition & 1 deletion ingestion/ingestion/storage/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ingestion.storage.filesystem import SEGMENTATION_REGEX


def fs_data_blob_name(dataset_id: str, p: Path, base_dir: Path) -> str:
def fs_data_blob_name(p: Path, base_dir: Path) -> str:
    """Return the blob name for the raw-data file *p*.

    The name is ``db-raw-data/`` followed by the path of *p* relative to
    *base_dir*, always written with forward slashes.

    Raises:
        ValueError: if *p* is not located under *base_dir* (raised by
            ``Path.relative_to``).
    """
    # as_posix() keeps the separator "/" even on Windows, where formatting
    # the relative path directly would put backslashes in the blob name.
    return f"db-raw-data/{p.relative_to(base_dir).as_posix()}"


Expand Down
2 changes: 1 addition & 1 deletion ingestion/tests/storage/test_blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
def test__fs_data_blob_name(
dataset_id: str, file_path: Path, base_dir: Path, blob_name: str
):
assert fs_data_blob_name(dataset_id, file_path, base_dir) == blob_name
assert fs_data_blob_name(file_path, base_dir) == blob_name


@pytest.mark.parametrize(
Expand Down

0 comments on commit 44265f9

Please sign in to comment.