Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding FHIR CodeSystem importer #137

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ xmltodict = "^0.13.0"
click-default-group = "^1.2.4"
linkml-runtime = "^1.7.2"
duckdb = "^0.10.1"
duckdb-engine = "^0.11.2"


[tool.poetry.dev-dependencies]
Expand Down
5 changes: 3 additions & 2 deletions schema_automator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def import_dosdps(dpfiles, output, **args):
@schema_name_option
def import_sql(db, output, **args):
"""
Imports a schema by introspecting a relational database
Imports a schema by introspecting a relational database.

See :ref:`importers` for more on the importers framework
"""
Expand Down Expand Up @@ -297,13 +297,14 @@ def import_htmltable(url, output, class_name, schema_name, columns,
@click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum')
@click.option('--omit-null/--no-omit-null', default=False, help="if true, ignore null values")
@click.option('--inlined-map', multiple=True, help="SLOT_NAME.KEY pairs indicating which slots are inlined as dict")
@click.option('--index-slot', help="slot to inject for lists of objects")
@click.option('--depluralize/--no-depluralized',
default=True,
show_default=True,
help="Auto-depluralize class names to singular form")
def generalize_json(input, output, schema_name, depluralize: bool, format, omit_null, inlined_map, **kwargs):
"""
Generalizes from a JSON file to a schema
Generalizes from a JSON (or YAML) file to a schema

See :ref:`generalizers` for more on the generalization framework

Expand Down
11 changes: 8 additions & 3 deletions schema_automator/generalizers/csv_data_generalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,17 +264,22 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition:
self.inject_foreign_keys(sv, fks)
return sv.schema

def convert(self, file: str, **kwargs) -> SchemaDefinition:
def convert(self, file: str, delimiter=None, **kwargs) -> SchemaDefinition:
"""
Converts a single TSV file to a single-class schema

:param file:
:param kwargs:
:return:
"""
if delimiter is None:
if file.endswith(".csv"):
delimiter = ","
else:
delimiter = self.column_separator
with open(file, newline='', encoding='utf-8') as tsv_file:
header = [h.strip() for h in tsv_file.readline().split('\t')]
rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False)
header = [h.strip() for h in tsv_file.readline().split(delimiter)]
rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=delimiter, skipinitialspace=False)
return self.convert_dicts([r for r in rr], **kwargs)

def convert_from_dataframe(self, df: pd.DataFrame, **kwargs) -> SchemaDefinition:
Expand Down
9 changes: 8 additions & 1 deletion schema_automator/generalizers/json_instance_generalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,14 @@ class JsonDataGeneralizer(Generalizer):


def convert(self, input: Union[str, Dict], format: str = 'json',
index_slot: str = None,
container_class_name='Container',
**kwargs) -> SchemaDefinition:
"""
Generalizes from a JSON file

:param input:
:param format:
:param format: json or yaml; use yaml_multi for multiple documents
:param container_class_name:
:param kwargs:
:return:
Expand All @@ -62,13 +63,19 @@ def convert(self, input: Union[str, Dict], format: str = 'json',
obj = json.load(stream)
elif format == 'yaml':
obj = yaml.safe_load(stream)
elif format == 'yaml_multi':
obj = list(yaml.safe_load_all(stream))
elif format == 'toml':
obj_str = "".join(stream.readlines())
toml_obj = tomlkit.parse(obj_str)
json_str = json.dumps(toml_obj)
obj = json.loads(json_str)
else:
raise Exception(f'bad format {format}')
if isinstance(obj, list):
if index_slot is None:
index_slot = 'members'
obj = {index_slot: obj}
rows_by_table = defaultdict(list)
self.rows_by_table = rows_by_table
self._convert_obj(obj, table=container_class_name)
Expand Down
56 changes: 56 additions & 0 deletions schema_automator/importers/fhir_codesystem_import_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import json
from typing import Any, Dict, List

from linkml_runtime.linkml_model import SchemaDefinition, EnumDefinition, PermissibleValue

from schema_automator.importers.import_engine import ImportEngine

class FHIRCodeSystemImportEngine(ImportEngine):
    """
    Imports a FHIR CodeSystem resource (JSON text) as a LinkML schema.

    All concept codes in the CodeSystem — including nested child concepts,
    which FHIR uses to express hierarchy — become permissible values of a
    single enum named ``CodeSystemEnum``.
    """

    def load(self, input: str) -> SchemaDefinition:
        """
        Parse a FHIR CodeSystem JSON string into a SchemaDefinition.

        :param input: JSON text of a FHIR CodeSystem resource
        :return: schema containing one enum, ``CodeSystemEnum``
        """
        data = json.loads(input)

        # Fall back to generic identifiers when the resource omits name/url
        schema = SchemaDefinition(
            name=data.get('name', 'FHIRCodeSystem'),
            id=data.get('url', 'http://example.org/FHIRCodeSystem'),
        )

        code_system_enum = EnumDefinition(
            name='CodeSystemEnum',
            description=data.get('description', 'A FHIR CodeSystem resource'),
        )

        # 'concept' holds the top-level concepts; each of these may carry a
        # nested 'concept' list of children (handled recursively below).
        if 'concept' in data:
            code_system_enum.permissible_values = self._process_concepts(data['concept'])

        schema.enums = {
            'CodeSystemEnum': code_system_enum
        }

        return schema

    def _process_concepts(self, concepts: List[Dict[str, Any]]) -> Dict[str, PermissibleValue]:
        """
        Flatten a (possibly nested) list of FHIR concepts into permissible values.

        :param concepts: list of FHIR concept dicts (each may nest children
            under its own 'concept' key)
        :return: mapping of concept code -> PermissibleValue
        """
        permissible_values: Dict[str, PermissibleValue] = {}

        for concept in concepts:
            code = concept['code']
            pv = PermissibleValue(
                text=code,
                title=concept.get('display', None),
                description=concept.get('definition', None),
            )

            # FHIR encodes hierarchy/status as concept properties
            for prop in concept.get('property', []):
                if prop['code'] == 'subsumedBy':
                    pv.is_a = prop['valueCode']
                if prop['code'] == 'status':
                    pv.status = prop['valueCode']

            permissible_values[code] = pv

            # Recurse into child concepts so hierarchical CodeSystems
            # (e.g. v3-RoleCode) contribute all of their codes, not just
            # the top level.
            if 'concept' in concept:
                permissible_values.update(self._process_concepts(concept['concept']))

        return permissible_values

25 changes: 25 additions & 0 deletions tests/test_importers/test_fhir_codesystem_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

from linkml.generators import PythonGenerator
from linkml_runtime import SchemaView

from schema_automator.importers.fhir_codesystem_import_engine import FHIRCodeSystemImportEngine
from schema_automator.utils import write_schema
from tests import INPUT_DIR, OUTPUT_DIR

INPUT_JSON = os.path.join(INPUT_DIR, "CodeSystem-v3-RoleCode.json")
OUT = os.path.join(OUTPUT_DIR, "CodeSystem-v3-RoleCode.linkml.yaml")


def test_fhir_code_system_import():
    """Round-trip a FHIR CodeSystem fixture through the importer."""
    with open(INPUT_JSON, "r", encoding="utf-8") as fh:
        raw_json = fh.read()

    engine = FHIRCodeSystemImportEngine()
    schema = engine.load(raw_json)
    assert schema
    write_schema(schema, OUT)

    # The emitted schema must be valid enough to generate Python from
    # and to load into a SchemaView.
    generated = PythonGenerator(OUT).serialize()
    assert generated
    _sv = SchemaView(schema)
Loading