diff --git a/scivision/catalog/catalog.py b/scivision/catalog/catalog.py
index ad883314..49e2fb59 100644
--- a/scivision/catalog/catalog.py
+++ b/scivision/catalog/catalog.py
@@ -180,6 +180,107 @@ def name_unique_key(cls, entries):
         return entries
 
 
+def get_models():
+    """Return the names of all entries in the bundled model catalog."""
+    models_raw = pkgutil.get_data(__name__, "data/models.json")
+    models = CatalogModels.parse_raw(models_raw)
+    return [entry["name"] for entry in models.entries]
+
+
+# ModelEnum is the element type of CatalogProjectEntry.models; its members
+# are the model names found in the bundled catalog at import time.
+modelEnumStrings = ((x, x) for x in get_models())
+ModelEnum = Enum('ModelEnum', modelEnumStrings)
+
+
+def get_datasources():
+    """Return the names of all entries in the bundled datasource catalog."""
+    datasources_raw = pkgutil.get_data(__name__, "data/datasources.json")
+    datasources = CatalogDatasources.parse_raw(datasources_raw)
+    return [entry["name"] for entry in datasources.entries]
+
+
+# DataEnum is the element type of CatalogProjectEntry.datasources; its members
+# are the datasource names found in the bundled catalog at import time.
+datasourceEnumStrings = ((x, x) for x in get_datasources())
+DataEnum = Enum('DataEnum', datasourceEnumStrings)
+
+
+class CatalogProjectEntry(BaseModel, extra="forbid", title="A project catalog entry"):
+    # tasks, institution and tags are Tuples (rather than Lists) so
+    # that they are immutable - Tuple is being used as an immutable
+    # sequence here. This means that these fields are hashable, which
+    # can be more convenient when included in a dataframe
+    # (e.g. unique()). Could consider using a Frozenset for these
+    # fields instead, since duplicates and ordering should not be
+    # significant.
+    name: str = Field(
+        ...,
+        title="Name",
+        description="Short, unique name for the project (one or two words, "
+        "under 20 characters recommended)",
+    )
+    header: str = Field(
+        ...,
+        title="Header",
+        description="Header that will display at the top of the project page",
+    )
+    description: Optional[str] = Field(
+        None,
+        title="Description",
+        description="Short description of the project (that will appear when hovering on the project thumbnail)",
+    )
+    page: Optional[str] = Field(
+        None,
+        title="Page",
+        description="Markdown formatted content for the project page",
+    )
+    models: Tuple[ModelEnum, ...] = Field(
+        (),
+        title="Models",
+        description="Which models from the scivision catalog are used in the project?",
+    )
+    datasources: Tuple[DataEnum, ...] = Field(
+        (),
+        title="Datasources",
+        description="Which datasources from the scivision catalog are used in the project?",
+    )
+    tasks: Tuple[TaskEnum, ...] = Field(
+        (),
+        title="Tasks",
+        description="Which task (or tasks) do the CV models used in the project perform?",
+    )
+    institution: Tuple[str, ...] = Field(
+        (),
+        title="Institution(s)",
+        description="A list of institutions that produced or are associated with "
+        "the project (one per item)",
+    )
+    tags: Tuple[str, ...]
+
+    def __getitem__(self, item):
+        """Allow dictionary-style access to a field, e.g. entry["name"]."""
+        return getattr(self, item)
+
+
+class CatalogProjects(BaseModel, extra="forbid"):
+    catalog_type: str = "scivision project catalog"
+    name: str
+    # Tuple: see comment on CatalogProjectEntry
+    entries: Tuple[CatalogProjectEntry, ...]
+
+    @validator("entries")
+    def name_unique_key(cls, entries):
+        """Reject a catalog in which two or more entries share a name."""
+        name_counts = Counter(entry['name'] for entry in entries)
+        dups = [item for item, count in name_counts.items() if count > 1]
+
+        if dups:
+            raise ValueError(f"The 'name' field in the project catalog should be unique (duplicates: {dups})")
+
+        return entries
+
+
 def _coerce_datasources_catalog(
     datasources: Union[CatalogDatasources, os.PathLike, None]
 ) -> CatalogDatasources:
@@ -213,7 +314,30 @@ def _coerce_models_catalog(
         models_raw = pkgutil.get_data(__name__, "data/models.json")
         return CatalogModels.parse_raw(models_raw)
     else:
-        raise TypeError("Cannot load datasource from unsupported type")
+        raise TypeError("Cannot load model from unsupported type")
+
+
+def _coerce_projects_catalog(
+    projects: Union[CatalogProjects, os.PathLike, None]
+) -> CatalogProjects:
+    """Returns a CatalogProjects determined from the argument: either the
+    one passed, or one loaded from a file
+
+    A str, bytes or os.PathLike argument is read as the path of a JSON
+    file; None loads the projects.json bundled with the package.  Any
+    other type raises TypeError.
+    """
+    if isinstance(projects, CatalogProjects):
+        return projects
+    elif isinstance(projects, (bytes, str, os.PathLike)):
+        # os.fsdecode: Path() rejects bytes, which this branch accepts
+        projects_raw = Path(os.fsdecode(projects)).read_text(encoding="utf-8")
+        return CatalogProjects.parse_raw(projects_raw)
+    elif projects is None:
+        projects_raw = pkgutil.get_data(__name__, "data/projects.json")
+        return CatalogProjects.parse_raw(projects_raw)
+    else:
+        raise TypeError("Cannot load project from unsupported type")
 
 
 class QueryResult(ABC):
@@ -234,7 +358,7 @@ def to_dataframe(self) -> pd.DataFrame:
 
 
 class PandasCatalog:
-    def __init__(self, datasources=None, models=None):
+    def __init__(self, datasources=None, models=None, projects=None):
         super().__init__()
 
         if isinstance(datasources, pd.DataFrame):
@@ -251,6 +375,12 @@ def __init__(self, datasources=None, models=None):
             models_cat = _coerce_models_catalog(models)
             self._models = pd.DataFrame([ent.dict() for ent in models_cat.entries])
 
+        if isinstance(projects, pd.DataFrame):
+            self._projects = projects
+        else:
+            projects_cat = _coerce_projects_catalog(projects)
+            self._projects = pd.DataFrame([ent.dict() for ent in projects_cat.entries])
+
     @property
     def models(self) -> PandasQueryResult:
         return PandasQueryResult(self._models)
@@ -259,6 +389,10 @@ def models(self) -> PandasQueryResult:
     def datasources(self) -> PandasQueryResult:
         return PandasQueryResult(self._datasources)
 
+    @property
+    def projects(self) -> PandasQueryResult:
+        return PandasQueryResult(self._projects)
+
     def _compatible_models(self, datasource) -> PandasQueryResult:
         models_compatible_format = self._models[
             self._models.format == datasource["format"]
diff --git a/scivision/catalog/data/projects.json b/scivision/catalog/data/projects.json
new file mode 100644
index 00000000..6ed64297
--- /dev/null
+++ b/scivision/catalog/data/projects.json
@@ -0,0 +1,31 @@
+{
+    "catalog_type": "scivision project catalog",
+    "name": "default",
+    "entries": [
+        {
+            "name": "sept22-connections-workshop",
+            "header": "Turing Connections Workshop",
+            "description": "Classification challenge for datasets of Flower and Butterfly images from Kaggle",
+            "page": "PhD candidates attended a workshop hosted by The Alan Turing Institute's Research Engineering Group (REG) to get some hands-on experience working on scivision as a piece of research software. The task involved adding pre-trained models and datasources to the scivision catalog. Successful submissions include two classification models, for flowers and butterflies, and related datasets.\n#### Notebooks:\n- [Butterflies](https://github.com/scivision-gallery/connections-workshop-examples/blob/main/ButterflyClassification.ipynb)\n- [Flowers](https://github.com/scivision-gallery/connections-workshop-examples/blob/main/FlowerClassification.ipynb)\n\nTo learn more about the REG at Turing, click [here](https://www.turing.ac.uk/research/research-engineering)",
+            "tasks": [
+                "classification"
+            ],
+            "models": [
+                "butterfly-classification-model",
+                "flower-classification-model"
+            ],
+            "datasources": [
+                "flowers",
+                "butterflies"
+            ],
+            "institution": [
+                "alan-turing-institute"
+            ],
+            "tags": [
+                "tutorial",
+                "butterflies",
+                "flowers"
+            ]
+        }
+    ]
+}