Feature/cele 46 #24

Merged 30 commits · Aug 28, 2024
Changes from 12 commits (30 commits total):
c8a081a
CELE-46 Initial work on ingestion validation
dvcorreia Aug 14, 2024
fbb54b7
CELE-46 Update dataset ingestion format doc
dvcorreia Aug 14, 2024
41399df
Merge branch 'develop' into feature/CELE-46
dvcorreia Aug 16, 2024
f92aca3
CELE-46 ingestion structures schema validation
dvcorreia Aug 16, 2024
6f7e9b0
CELE-46 Update data ingestion validation and tests
dvcorreia Aug 16, 2024
c015701
CELE-46 Updated connection elements size rule
dvcorreia Aug 20, 2024
c1b6492
CELE-46 Updated format ingestion docs
dvcorreia Aug 20, 2024
605011a
CELE-46 Remove outdated comments
dvcorreia Aug 20, 2024
519b6a7
CELE-46 Code dust
dvcorreia Aug 20, 2024
a2a5e7e
CELE-46 Code review syntactic changes
dvcorreia Aug 21, 2024
c24669a
CELE-46 Move ingestion validation code to project root
dvcorreia Aug 21, 2024
bdccca2
CELE-46 Specification of enum type for DatasetType
dvcorreia Aug 21, 2024
04fd692
CELE-46 Add subset of reference ingestion data as test fixtures
dvcorreia Aug 21, 2024
dd1e02e
CELE-46 Set test fixtures to be ignored by github diff
dvcorreia Aug 21, 2024
7edc1d1
CELE-46 Set test fixtures as vendored in gitattributes
dvcorreia Aug 21, 2024
c3eb860
CELE-46 Set test fixtures as vendored in gitattributes (fix)
dvcorreia Aug 21, 2024
fd9f978
CELE-46 Set test fixtures as vendored in gitattributes (fix 2)
dvcorreia Aug 21, 2024
84c45d9
CELE-46 Work on humanizing pydantic's validation errors
dvcorreia Aug 21, 2024
fc81b94
CELE-46 Load ingestion data from the file system
dvcorreia Aug 22, 2024
4607422
CELE-46 Options pattern for ingestion data error writer
dvcorreia Aug 22, 2024
bc829b8
CELE-46 Validation file is now called schema
dvcorreia Aug 22, 2024
d6f3ecb
CELE-46 Error file snippet feature
dvcorreia Aug 23, 2024
35242b1
CELE-46 Ignore mypy errors for json_source_map
dvcorreia Aug 23, 2024
91f0b23
CELE-46 Small error print format adjustment
dvcorreia Aug 23, 2024
b82d6bd
CELE-46 Add ingestion error line in file path
dvcorreia Aug 26, 2024
54e3a64
CELE-46 Ingestion data validation error wrapper and external lib for …
dvcorreia Aug 26, 2024
e933e67
CELE-46 Remove flake.nix file
dvcorreia Aug 27, 2024
c0a810b
CELE-46 Add test for filesystem find and load ingestion files
dvcorreia Aug 27, 2024
3b1b177
CELE-46 Code review suggestions
dvcorreia Aug 28, 2024
5610682
CELE-46 Remove flake.nix
dvcorreia Aug 28, 2024
12 changes: 6 additions & 6 deletions format-ingestion.md
@@ -6,7 +6,7 @@ Different files are necessary:
* `neurons.json` that encodes the information about the neurons in general
* `datasets.json` that encodes the information about the different datasets
* `connections/xxx.json` that encodes the different connections for dedicated datasets
-* `annotations/xxx.json` that encodes annotatinos for different zones of the anatomy
+* `annotations/xxx.json` that encodes annotations for different zones of the anatomy

Those files are automatically exported from a third-party tool and shouldn't be edited manually.

@@ -63,9 +63,9 @@ Each JSON object represents a specific dataset with this schema:
{
"id": string // unique ID for the dataset
"name": string // display name of the dataset
-"type": string // type of dataset: "complete" or "head"
-"time": int // time of the dataset
-"visualTime": int // visualTime of the dataset
+"type": string // type of dataset: "complete", "head" or "tail"
+"time": float // time of the dataset
+"visualTime": float // visualTime of the dataset
"description": string // description of the dataset
"axes": [ // OPTIONAL: different axes and their representation, not used but can appear in the file
...
@@ -89,11 +89,11 @@ The schema is the following:
"pre": string, // the name of a neuron as defined in "neurons.json"
"pre_tid": [ ... ], // a list of int where each int represents the ID of a pre synapse for a dedicated pre neuron
"syn": [ ... ], // a list of int where each int represents the weight of a post or pre synapses (indice matches the neuron in pre/post_tid)
-"typ": int // the type of connection ("electrical" or "chemical")
+"typ": int // the type of connection ("electrical" (0) or "chemical" (2))
}
```

-For each of those objects: `ids`, `post_tid`, `pre_tid` and `syn` need to have the same number of elements.
+For each of those objects: `ids`, `post_tid`, `pre_tid` and `syn` need to have the same number of elements when `ids` is present.
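The updated size rule can be sketched as a standalone check (plain Python over a parsed JSON object, not the project's pydantic code; field names come from the schema above, the neuron names in the example are hypothetical):

```python
def check_connection_sizes(conn: dict) -> None:
    """Enforce the rule above: when "ids" is present (and non-empty), it must
    have the same number of elements as "post_tid", "pre_tid" and "syn"."""
    ids = conn.get("ids", [])
    if not ids:
        return  # no "ids": the size rule does not apply
    lengths = {len(ids), len(conn["post_tid"]), len(conn["pre_tid"]), len(conn["syn"])}
    if len(lengths) != 1:
        raise ValueError(
            "ids, post_tid, pre_tid and syn must have the same number of elements"
        )

# A connection without "ids" is accepted as-is:
check_connection_sizes(
    {"pre": "ADAL", "post": "ADAR", "post_tid": [7], "pre_tid": [3], "syn": [2], "typ": 2}
)
```

A connection carrying a 2-element `ids` alongside 1-element `post_tid`/`pre_tid`/`syn` lists would raise a `ValueError` under this check.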

## Format of `annotations/xxx.json`

1 change: 1 addition & 0 deletions ingestion/ingestion/__init__.py
@@ -0,0 +1 @@
# This is intentionally left blank
106 changes: 106 additions & 0 deletions ingestion/ingestion/validator.py
@@ -0,0 +1,106 @@
from __future__ import annotations

from enum import Enum, IntEnum
from typing import Literal

from pydantic import BaseModel, Field, RootModel, model_validator


class Neuron(BaseModel):
inhead: bool # int used as bool, is the neuron part of the head or not
name: str # name of the neuron, can be same as classes, or L or R of classes
emb: bool # int used as bool
nt: str # neurotransmitter type
intail: bool # int used as bool
classes: str # general name of the neuron
typ: str # type of the neuron


class DatasetType(str, Enum):
COMPLETE = "complete"
HEAD = "head"
TAIL = "tail"


class Axe(BaseModel):
face: str
axisIndex: int
axisTransform: int


class Dataset(BaseModel):
id: str
name: str
type: DatasetType
time: float # TODO: should add validation gte than 0?
visualTime: float # TODO: should add validation gte than 0?
description: str
axes: list[Axe] | None = Field(
default=None, description="different axes and their representation"
)


class ConnectionType(IntEnum):
ELECTRICAL = 0
CHEMICAL = 2


class Connection(BaseModel):
ids: list[int] = Field(
default_factory=list,
description="list of neuron IDs involved in this connection",
)
post: str # the name of a neuron as defined in "neurons.json"
post_tid: list[int] = Field(
default_factory=list,
description="list of neuron IDs of a post synapse for a dedicated post neuron",
)
pre: str # the name of a neuron as defined in "neurons.json"
pre_tid: list[int] = Field(
default_factory=list,
description="list of neuron IDs of a pre synapse for a dedicated pre neuron",
)
syn: list[int] = Field(
...,
description="list of weights of a post or pre synapses (indice matches the neuron in pre/post_tid)",
)
typ: ConnectionType # the type of connection ("electrical" or "chemical")

@model_validator(mode="after")
def check_same_size_elements(self):
if len(self.ids) != 0:
assert (
len(self.ids)
== len(self.post_tid)
== len(self.pre_tid)
== len(self.syn)
), "ids, post_tid, pre_tid and syn must have the same number of elements"

return self


class Annotation(RootModel):
root: dict[
Literal["increase", "variable", "postembryonic", "decrease", "stable"],
list[
tuple[ # the type of annotation
str, # pre, the ID/name of a neuron from "neurons.json"
str, # post, the ID/name of the other neuron from "neurons.json" that is part of the couple
]
],
] = {}


class Data(BaseModel):
neurons: list[Neuron]
datasets: list[Dataset]
connections: dict[str, list[Connection]] = {}
annotations: dict[Literal["head", "complete", "tail"], Annotation] = {}

@model_validator(mode="after")
def check_connection_dataset_exists(self):
existing_datasets = [dt.id for dt in self.datasets]
assert all(
dataset_id in existing_datasets for dataset_id in self.connections.keys()
), "missing dataset definition for connection"
return self
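The final cross-file check in `Data` (every key of `connections` must name a dataset defined in `datasets.json`) can be illustrated without pydantic. This is a hedged sketch over plain dicts, not the project's API; the dataset id `witness1` is made up for the example:

```python
def missing_connection_datasets(datasets: list[dict], connections: dict) -> list[str]:
    """Return the connection keys with no matching dataset "id".

    An empty result means every connections/xxx.json file maps to a
    dataset declared in datasets.json, mirroring the model validator above.
    """
    known_ids = {d["id"] for d in datasets}
    return [key for key in connections if key not in known_ids]

datasets = [{"id": "witness1"}]
assert missing_connection_datasets(datasets, {"witness1": []}) == []
assert missing_connection_datasets(datasets, {"witness2": []}) == ["witness2"]
```

In the pydantic version this same condition is asserted inside a `@model_validator(mode="after")`, so a bad key surfaces as a `ValidationError` instead of a returned list.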
72 changes: 72 additions & 0 deletions ingestion/pyproject.toml
@@ -0,0 +1,72 @@
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "ingestion"
version = "0.0.1"
description = "CLI tool to ingest c-elegans data"
readme = "README.md"
requires-python = ">=3.10"
authors = [
{ name = "Vincent Aranega", email = "[email protected]" },
{ name = "Diogo Correia", email = "[email protected]" },
]
maintainers = [
{ name = "Vincent Aranega", email = "[email protected]" },
{ name = "Diogo Correia", email = "[email protected]" },
]
dependencies = [
"pydantic==2.8.2",
]

[project.optional-dependencies]
dev = [
"black>=24.8.0",
"coverage>=7.6.1",
"isort>=5.13.2",
"mypy==1.11.1", # lock version: manual upgrade is advised
"pytest>=8",
"pytest-asyncio",
]

[tool.setuptools.packages.find]
where = ["."] # list of folders that contain the packages (["."] by default)
include = ["*"] # package names should match these glob patterns (["*"] by default)
exclude = [
"tests*",
] # exclude packages matching these glob patterns (empty by default)
namespaces = false # false to disable scanning PEP 420 namespaces (true by default)

[tool.black]
line-length = 88
target-version = ['py310']
include = '\.pyi?$'

[tool.isort]
profile = "black"
line_length = 88
src_paths = ["ingestion", "tests"]
add_imports = ["from __future__ import annotations"]

[tool.mypy]
python_version = "3.10"

[tool.pytest.ini_options]
minversion = "8.0"
addopts = "-v"
asyncio_mode = "strict"
testpaths = ["tests"]

[tool.coverage.run]
branch = true
source = ["ingestion"]
omit = [
"venv/*",
".venv/*",
"tests/*",
]

[tool.coverage.report]
show_missing = true
fail_under = 0