Create scivision projects catalog and add first entry for testing pur…

…poses (#475) * Add para * move docs for python package in navbar * add pypi link * rename to scivision * summary md that can be imported in 3 locations, README, docs and web * summary updated * move summary into frontend * move AboutText to separate file * use react-showdown to render markdown * move mdsummary into about.js * add GitHub to navbar * add ➕ emoji to navbar * simplify headers * make README consistent with scivision web * move support table below installation * make readme header consistent with readthedocs * make readthedocs intro consistent with README and webapp (bad links) * add header and image to make webapp consistent with readthedocs and GH * reduce column width * consistent/tidy descriptions * minor differences between the 3 landing pages * improve readthedocs landing page links * add user guide link * add support table to readthedocs and make install consistent with README * move install instructions out of README * add first attempt at a projects.json * add authors and contributors * add a header field * use plural for fields that are likely plural * create CatalogProjectEntry * create CatalogProjects * add _coerce_projects_catalog * fix error msg * add projects to PandasCatalog * whitespace and missing var * add missing comma * fix json * stop mistakenly using TaskEnum * duplicate models code for projects * add links to thumbnails * import projects json * fix plural * add projects pages * move model-grid position in App.js * add project-grid * add links * fix model-grid * move for clarity * remove not needed * remove frontend changes * make the project entry about the connections workshop rather than a tutorial * move header above desc * add description to project catalog * merge catalog changes from react branch * remove unused import * flake8
alan-turing-institute · Jan 17, 2023 · 3a9f5f4 · 3a9f5f4
1 parent 04625ee
commit 3a9f5f4
Show file tree

Hide file tree

Showing 2 changed files with 160 additions and 2 deletions.
diff --git a/scivision/catalog/catalog.py b/scivision/catalog/catalog.py
@@ -180,6 +180,105 @@ def name_unique_key(cls, entries):
         return entries
 
 
+def get_models():
+    """Return the ``name`` of every entry in the packaged models catalog.
+
+    The catalog is read from ``data/models.json`` (located relative to
+    this module via ``pkgutil``) and parsed as a ``CatalogModels``.
+    """
+    raw_catalog = pkgutil.get_data(__name__, "data/models.json")
+    catalog = CatalogModels.parse_raw(raw_catalog)
+    return [entry["name"] for entry in catalog.entries]
+
+
+# Build an Enum at import time with one member per model name returned by
+# get_models(); each member's name and value are both that model name.
+# ModelEnum is used below as the element type of CatalogProjectEntry.models.
+modelEnumStrings = ((x, x) for x in get_models())
+ModelEnum = Enum('ModelEnum', modelEnumStrings)
+
+
+def get_datasources():
+    """Return the ``name`` of every entry in the packaged datasources catalog.
+
+    The catalog is read from ``data/datasources.json`` (located relative
+    to this module via ``pkgutil``) and parsed as a ``CatalogDatasources``.
+    """
+    raw_catalog = pkgutil.get_data(__name__, "data/datasources.json")
+    catalog = CatalogDatasources.parse_raw(raw_catalog)
+    return [entry["name"] for entry in catalog.entries]
+
+
+# Build an Enum at import time with one member per datasource name returned
+# by get_datasources(); each member's name and value are both that name.
+# DataEnum is used below as the element type of
+# CatalogProjectEntry.datasources.
+datasourceEnumStrings = ((x, x) for x in get_datasources())
+DataEnum = Enum('DataEnum', datasourceEnumStrings)
+
+
+class CatalogProjectEntry(BaseModel, extra="forbid", title="A project catalog entry"):
+    # One entry of the project catalog. Unknown keys are rejected
+    # (extra="forbid").
+    #
+    # models, datasources, tasks, institution and tags are Tuples (rather
+    # than Lists) so that they are immutable - Tuple is being used as an
+    # immutable sequence here. This means that these fields are hashable,
+    # which can be more convenient when included in a dataframe
+    # (e.g. unique()). Could consider using a Frozenset for these
+    # fields instead, since duplicates and ordering should not be
+    # significant.
+    name: str = Field(
+        ...,
+        title="Name",
+        description="Short, unique name for the project (one or two words, "
+        "under 20 characters recommended)",
+    )
+    header: str = Field(
+        ...,
+        title="Header",
+        description="Header that will display at the top of the project page",
+    )
+    description: Optional[str] = Field(
+        None,
+        title="Description",
+        description="Short description of the project (that will appear when hovering on the project thumbnail)",
+    )
+    # The default is None, so the annotation is Optional[str] (as for
+    # `description`) rather than a bare `str` that the default contradicts.
+    page: Optional[str] = Field(
+        None,
+        title="Page",
+        description="Markdown formatted content for the project page",
+    )
+    # Members must be names present in the models catalog (see ModelEnum).
+    models: Tuple[ModelEnum, ...] = Field(
+        (),
+        title="Models",
+        description="Which models from the scivision catalog are used in the project?",
+    )
+    # Members must be names present in the datasources catalog (see DataEnum).
+    datasources: Tuple[DataEnum, ...] = Field(
+        (),
+        title="Datasources",
+        description="Which datasources from the scivision catalog are used in the project?",
+    )
+    tasks: Tuple[TaskEnum, ...] = Field(
+        (),
+        title="Tasks",
+        description="Which task (or tasks) do the CV models used in the project perform?",
+    )
+    institution: Tuple[str, ...] = Field(
+        (),
+        title="Institution(s)",
+        description="A list of institutions that produced or are associated with "
+        "the project (one per item)",
+    )
+    # Required: no default is given for tags.
+    tags: Tuple[str, ...]
+
+    def __getitem__(self, item):
+        """Allow dict-style access, e.g. ``entry["name"]`` for ``entry.name``."""
+        return getattr(self, item)
+
+
+class CatalogProjects(BaseModel, extra="forbid"):
+    # Top-level container of the project catalog. Unknown keys are
+    # rejected (extra="forbid").
+    catalog_type: str = "scivision project catalog"
+    name: str
+    # A Tuple rather than a List, so the sequence of entries is immutable.
+    entries: Tuple[CatalogProjectEntry, ...]
+
+    @validator("entries")
+    def name_unique_key(cls, entries):
+        """Check that no two entries share a 'name'; return entries unchanged.
+
+        Raises ValueError listing the duplicated names otherwise.
+        """
+        occurrences = Counter(entry["name"] for entry in entries)
+        dups = [name for name, seen in occurrences.items() if seen > 1]
+        if not dups:
+            return entries
+        raise ValueError(f"The 'name' field in the project catalog should be unique (duplicates: {dups})")
+
+
 def _coerce_datasources_catalog(
     datasources: Union[CatalogDatasources, os.PathLike, None]
 ) -> CatalogDatasources:
@@ -213,7 +312,25 @@ def _coerce_models_catalog(
         models_raw = pkgutil.get_data(__name__, "data/models.json")
         return CatalogModels.parse_raw(models_raw)
     else:
-        raise TypeError("Cannot load datasource from unsupported type")
+        raise TypeError("Cannot load model from unsupported type")
+
+
+def _coerce_projects_catalog(
+    projects: Union[CatalogProjects, os.PathLike, None]
+) -> CatalogProjects:
+    """Returns a CatalogProjects determined from the argument: either the
+    one passed, or one loaded from a file
+
+    projects may be:
+      - a CatalogProjects instance, which is returned unchanged;
+      - a filesystem path (str, bytes or os.PathLike) to a JSON file,
+        which is read as UTF-8 and parsed;
+      - None, in which case the packaged "data/projects.json" is loaded.
+
+    Raises TypeError for any other argument type.
+    """
+    if isinstance(projects, CatalogProjects):
+        return projects
+    if isinstance(projects, (bytes, str, os.PathLike)):
+        # Path() does not accept bytes, so normalise every accepted path
+        # flavour to str first. JSON is read explicitly as UTF-8 rather
+        # than with the platform's locale encoding.
+        projects_path = Path(os.fsdecode(projects))
+        projects_raw = projects_path.read_text(encoding="utf-8")
+        return CatalogProjects.parse_raw(projects_raw)
+    if projects is None:
+        projects_raw = pkgutil.get_data(__name__, "data/projects.json")
+        return CatalogProjects.parse_raw(projects_raw)
+    raise TypeError("Cannot load project from unsupported type")
 
 
 class QueryResult(ABC):
@@ -234,7 +351,7 @@ def to_dataframe(self) -> pd.DataFrame:
 
 
 class PandasCatalog:
-    def __init__(self, datasources=None, models=None):
+    def __init__(self, datasources=None, models=None, projects=None):
         super().__init__()
 
         if isinstance(datasources, pd.DataFrame):
@@ -251,6 +368,12 @@ def __init__(self, datasources=None, models=None):
             models_cat = _coerce_models_catalog(models)
             self._models = pd.DataFrame([ent.dict() for ent in models_cat.entries])
 
+        if isinstance(projects, pd.DataFrame):
+            self._projects = projects
+        else:
+            projects_cat = _coerce_projects_catalog(projects)
+            self._projects = pd.DataFrame([ent.dict() for ent in projects_cat.entries])
+
     @property
     def models(self) -> PandasQueryResult:
         return PandasQueryResult(self._models)
@@ -259,6 +382,10 @@ def models(self) -> PandasQueryResult:
     def datasources(self) -> PandasQueryResult:
         return PandasQueryResult(self._datasources)
 
+    @property
+    def projects(self) -> PandasQueryResult:
+        return PandasQueryResult(self._projects)
+
     def _compatible_models(self, datasource) -> PandasQueryResult:
         models_compatible_format = self._models[
             self._models.format == datasource["format"]

diff --git a/scivision/catalog/data/projects.json b/scivision/catalog/data/projects.json
@@ -0,0 +1,31 @@
+{
+  "catalog_type": "scivision project catalog",
+  "name": "default",
+  "entries": [
+    {
+      "name": "sept22-connections-workshop",
+      "header": "Turing Connections Workshop",
+      "description": "Classification challenge for datasets of Flower and Butterfly images from Kaggle",
+      "page": "PhD candidates attended a workshop hosted by The Alan Turing Institute's Research Engineering Group (REG) to get some hands-on experience working on scivision as a piece of research software. The task involved adding pre-trained models and datasources to the scivision catalog. Successful submissions include two classification models, for flowers and butterflies, and related datasets.\n#### Notebooks:\n- [Butterflies](https://github.com/scivision-gallery/connections-workshop-examples/blob/main/ButterflyClassification.ipynb)\n- [Flowers](https://github.com/scivision-gallery/connections-workshop-examples/blob/main/FlowerClassification.ipynb)\n\nTo learn more about the REG at Turing, click [here](https://www.turing.ac.uk/research/research-engineering)",
+      "tasks": [
+        "classification"
+      ],
+      "models": [
+        "butterfly-classification-model",
+        "flower-classification-model"
+      ],
+      "datasources": [
+        "flowers",
+        "butterflies"
+      ],
+      "institution": [
+        "alan-turing-institute"
+      ],
+      "tags": [
+        "tutorial",
+        "butterflies",
+        "flowers"
+      ]
+    }
+  ]
+}