Skip to content

Commit

Permalink
kwalify importer (#136)
Browse files Browse the repository at this point in the history
* Kwalify importer

* test-files

* Fixed test
  • Loading branch information
cmungall authored Mar 22, 2024
1 parent c743fd0 commit 398e7f9
Show file tree
Hide file tree
Showing 16 changed files with 2,721 additions and 1,251 deletions.
9 changes: 9 additions & 0 deletions docs/packages/importers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ The ``import-json-schema`` command can be used:
schemauto import-json-schema tests/resources/model_card.schema.json
Importing from Kwalify
----------------------

The ``import-kwalify`` command can be used:

.. code-block::
schemauto import-kwalify tests/resources/test.kwalify.yaml
Importing from OWL
------------------

Expand Down
59 changes: 58 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ schemasheets = ">=0.1.24"
xmltodict = "^0.13.0"
click-default-group = "^1.2.4"
linkml-runtime = "^1.7.2"
duckdb = "^0.10.1"


[tool.poetry.dev-dependencies]
Expand Down
26 changes: 26 additions & 0 deletions schema_automator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
from schema_automator.importers.jsonschema_import_engine import JsonSchemaImportEngine
from schema_automator.importers.kwalify_import_engine import KwalifyImportEngine
from schema_automator.importers.owl_import_engine import OwlImportEngine
from schema_automator.generalizers.rdf_data_generalizer import RdfDataGeneralizer
from schema_automator.importers.rdfs_import_engine import RdfsImportEngine
Expand Down Expand Up @@ -347,6 +348,12 @@ def generalize_toml(input, output, schema_name, omit_null, **kwargs):
@output_option
@schema_name_option
@use_attributes_option
@click.option(
"--is-openapi/--no-is-openapi",
default=False,
show_default=True,
help="If true, use OpenAPI schema style"
)
@click.option("--import-project/--no-import-project",
help="If true, then the input path should be a directory with multiple schema files")
@click.option('--format', '-f', default='json', help='JSON Schema format - yaml or json')
Expand All @@ -370,6 +377,25 @@ def import_json_schema(input, output, import_project: bool, schema_name, format,
ie.import_project(input, output, name=schema_name, format=format)


@main.command()
@click.argument('input')
@output_option
@schema_name_option
@use_attributes_option
def import_kwalify(input, output, schema_name, **kwargs):
    """
    Imports from Kwalify Schema to LinkML

    See :ref:`importers` for more on the importer framework

    Example:

        schemauto import-kwalify my/schema/personinfo.kwalify.yaml
    """
    ie = KwalifyImportEngine(**kwargs)
    # Fix: the original call passed ``format=format``, but this command
    # declares no ``--format`` option or local ``format`` variable, so it
    # silently passed the *builtin* ``format()`` function to the engine.
    schema = ie.convert(input, output, name=schema_name)
    write_schema(schema, output)

@main.command()
@click.argument('input')
@output_option
Expand Down
118 changes: 106 additions & 12 deletions schema_automator/importers/cadsr_import_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""
import logging
import urllib
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable, Iterator

from dataclasses import dataclass

Expand All @@ -19,6 +19,7 @@
from schema_automator.importers.import_engine import ImportEngine
import schema_automator.metamodels.cadsr as cadsr

ID_LABEL_PAIR = Tuple[str, str]

TMAP = {
"DATE": "date",
Expand All @@ -38,6 +39,28 @@
"Floating-point": "float",
}

def extract_concepts(concepts: List[cadsr.Concept]) -> Tuple[ID_LABEL_PAIR, List[str]]:
    """
    Partition caDSR concepts into a primary concept and the remainder.

    Only concepts whose ``evsSource`` is ``"NCI_CONCEPT_CODE"`` are
    considered; each is turned into an NCIT CURIE.

    :param concepts: concepts attached to a caDSR element
    :return: a ``(CURIE, label)`` pair for the primary concept, plus a list
        of CURIEs for the remaining non-primary concepts
    :raises ValueError: if ``concepts`` is empty, contains no NCI concept,
        or flags more than one concept as primary
    """
    main = None
    rest: List[ID_LABEL_PAIR] = []
    if not concepts:
        raise ValueError("No concepts")
    for concept in concepts:
        # Skip concepts sourced from vocabularies other than NCI Thesaurus
        if concept.evsSource != "NCI_CONCEPT_CODE":
            continue
        curie = f"NCIT:{concept.conceptCode.strip()}"
        pair = curie, concept.longName
        if concept.primaryIndicator == "Yes":
            if main:
                raise ValueError(f"Multiple primary for: {concepts}")
            main = pair
        else:
            rest.append(pair)
    if not main:
        if not rest:
            # Fix: previously this path crashed with IndexError on rest[0]
            raise ValueError(f"No NCI concepts found in: {concepts}")
        logging.warning(f"No primary, using arbitrary from {rest}")
        # Fix: fall back to the full (CURIE, label) pair — the original
        # returned a bare CURIE string here, so callers doing main[0]
        # would get the single character "N" instead of the CURIE.
        main, rest = rest[0], rest[1:]
    return main, [curie for curie, _ in rest]

@dataclass
class CADSRImportEngine(ImportEngine):
"""
Expand Down Expand Up @@ -94,19 +117,30 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
title=cde.preferredName,
description=cde.preferredDefinition,
aliases=[cde.longName],
conforms_to=f"cadsr:DataElement",
source=source,
)
# each data element belongs to a concept
# (may be reused across classes?)
slots[slot.name] = slot
concept = cde.DataElementConcept
concept_name = urllib.parse.quote(camelcase(f"{ctxt} {concept.preferredName}"))
parent_concept_name = urllib.parse.quote(camelcase(concept.longName))
# a concept is linked to a class
objectClass = concept.ObjectClass
# NCIT concepts describing the class
mainConcept, mappings = extract_concepts(objectClass.Concepts)
class_name = objectClass.longName
concept_name = urllib.parse.quote(camelcase(f"{ctxt} {class_name}"))
parent_concept_name = urllib.parse.quote(class_name)
if parent_concept_name not in classes:
parent_cls = ClassDefinition(
name=parent_concept_name,
title=concept.preferredName,
description=concept.preferredDefinition,
title=objectClass.preferredName,
description=objectClass.preferredDefinition,
#aliases=[concept.longName],
class_uri=f"cadsr:{concept.publicId}",
class_uri=f"cadsr:{objectClass.publicId}",
exact_mappings=[mainConcept[0]],
broad_mappings=mappings,
conforms_to=f"cadsr:ObjectClass",
)
classes[parent_concept_name] = parent_cls
if concept_name not in classes:
Expand All @@ -117,14 +151,23 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
aliases=[concept.longName],
class_uri=f"cadsr:{concept.publicId}",
is_a=parent_concept_name,
conforms_to=f"cadsr:DataElementConcept",
)
classes[concept_name] = cls
else:
cls = classes[concept_name]
cls.slots.append(slot.name)
objectClass = concept.ObjectClass
# TODO
# In theory the ObjectClass should link to a general class of utility in NCIT.
# In practice the actual concept may not be so useful. E.g. in 2724331
# "Agent Adverse Event Attribution Name" the DataConcept is
# Agent (C1708) defined as "An active power or cause (as principle,
# substance, physical or biological factor, etc.) that produces a specific effect."
# which is very upper-ontological
#for ocConcept in objectClass.Concepts:
# if ocConcept.evsSource == "NCI_CONCEPT_CODE":
# cls.is_a = f"NCIT:{ocConcept.conceptCode}"
valueDomain = cde.ValueDomain
# TODO
conceptualDomain = valueDomain.ConceptualDomain
pvs = valueDomain.PermissibleValues
if pvs:
Expand All @@ -140,7 +183,7 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
rng = enum_name
for pv in pvs:
# url encode the value to escape symbols like <, >, etc.
pv_value = urllib.parse.quote(pv.value)
pv_value = urllib.parse.quote(pv.value).replace("%20", " ")
tgt_pv = PermissibleValue(
text=pv_value,
title=pv.value,
Expand All @@ -151,9 +194,10 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
tgt_pv.title = vm.preferredName
if not tgt_pv.description:
tgt_pv.description = vm.preferredDefinition
for c in vm.Concepts:
code = c.conceptCode.strip()
tgt_pv.meaning = f"NCIT:{code}"
if vm.Concepts:
mainConcept, mappings = extract_concepts(vm.Concepts)
tgt_pv.meaning = mainConcept[0]
tgt_pv.broad_mappings = mappings
else:
datatype = valueDomain.dataType
rng = TMAP.get(datatype, "string")
Expand All @@ -179,6 +223,56 @@ def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs)
schema.enums = enums
return schema

def as_rows(self, paths: Iterable[str], **kwargs) -> Iterator[Dict]:
    """
    Flatten one or more caDSR DataElement JSON files into row dicts.

    Each file is parsed as a ``cadsr.DataElementContainer`` and its
    DataElement is recursively expanded via :meth:`_obj_as_rows`.

    :param paths: paths to JSON files, one DataElementContainer per file
    :param kwargs: currently unused
    :return: iterator of row dicts for every entity reachable from each file
    """
    for source_path in paths:
        logging.info(f"Loading {source_path}")
        with open(source_path) as stream:
            container: cadsr.DataElementContainer = json_loader.load(
                stream, target_class=cadsr.DataElementContainer
            )
            yield from self._obj_as_rows(container.DataElement, source_path)

def _obj_as_rows(self, e: Union[cadsr.DataElement, cadsr.DataElementConcept, cadsr.Concept, cadsr.Property, cadsr.ObjectClass, cadsr.ConceptualDomain,
    cadsr.ValueDomain, cadsr.PermissibleValue, cadsr.ValueMeaning], parent_id: str) -> Iterator[Dict]:
    """
    Recursively flatten a caDSR entity and its children into row dicts.

    Every yielded dict carries at least ``id``, ``parentId``, and ``type``
    (the latter taken from the entity class's ``class_name``); the other
    keys depend on the entity type.

    :param e: the caDSR entity to flatten
    :param parent_id: id of the parent entity (the file path at the root)
    :return: iterator over a row for ``e`` followed by rows for its children
    """
    if isinstance(e, cadsr.Concept):
        # Concepts are keyed by their concept code rather than a publicId
        obj = {
            "id": e.conceptCode,
            "context": e.evsSource,
            "longName": e.longName,
        }
    elif isinstance(e, cadsr.CDEPermissibleValue):
        # NOTE(review): this branch tests CDEPermissibleValue, while the
        # signature and the fan-out below use PermissibleValue — confirm
        # which class actually flows through here.
        obj = {
            "id": e.publicId,
            "value": e.value,
            "valueDescription": e.valueDescription,
        }
    else:
        # Generic shape for all other entity types
        obj = {
            "id": e.publicId,
            "preferredName": e.preferredName,
            "context": e.context,
            "longName": e.longName,
        }
    obj["parentId"] = parent_id
    obj["type"] = type(e).class_name
    id = obj["id"]
    yield obj
    # Recurse into type-specific children, threading this row's id as parent
    if isinstance(e, cadsr.DataElement):
        yield from self._obj_as_rows(e.DataElementConcept, id)
        yield from self._obj_as_rows(e.ValueDomain, id)
    elif isinstance(e, cadsr.DataElementConcept):
        yield from self._obj_as_rows(e.ObjectClass, id)
        yield from self._obj_as_rows(e.Property, id)
        yield from self._obj_as_rows(e.ConceptualDomain, id)
    elif isinstance(e, cadsr.ValueDomain):
        # NOTE(review): only each PV's ValueMeaning is emitted here — the
        # PermissibleValue itself never becomes a row; confirm intended.
        for pv in e.PermissibleValues:
            yield from self._obj_as_rows(pv.ValueMeaning, id)
    if isinstance(e, (cadsr.ObjectClass, cadsr.Property, cadsr.PermissibleValue)):
        for c in e.Concepts:
            yield from self._obj_as_rows(c, id)






Loading

0 comments on commit 398e7f9

Please sign in to comment.