From b854dbcb320e297aa5edd02b605ff27185a66ad9 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 20 Jun 2024 18:22:00 -0700 Subject: [PATCH] Adding FHIR CodeSystem importer --- poetry.lock | 19 ++++++- pyproject.toml | 1 + schema_automator/cli.py | 5 +- .../generalizers/csv_data_generalizer.py | 11 +++- .../generalizers/json_instance_generalizer.py | 9 ++- .../fhir_codesystem_import_engine.py | 56 +++++++++++++++++++ .../test_fhir_codesystem_importer.py | 25 +++++++++ 7 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 schema_automator/importers/fhir_codesystem_import_engine.py create mode 100644 tests/test_importers/test_fhir_codesystem_importer.py diff --git a/poetry.lock b/poetry.lock index 7b87a86..dd1f578 100644 --- a/poetry.lock +++ b/poetry.lock @@ -893,6 +893,22 @@ files = [ {file = "duckdb-0.10.1.tar.gz", hash = "sha256:0d5b6daa9bb54a635e371798994caa08f26d2f145ebcbc989e16b0a0104e84fb"}, ] +[[package]] +name = "duckdb-engine" +version = "0.11.2" +description = "SQLAlchemy driver for duckdb" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "duckdb_engine-0.11.2-py3-none-any.whl", hash = "sha256:786a9a14b56297d8b98ec6213dce79f0bdb44129bf5faa5792e0fa24f290e7a0"}, + {file = "duckdb_engine-0.11.2.tar.gz", hash = "sha256:40644334a0af02bdb50bbd8c57e4bd29441e7bf9bd21b565848645bae318e533"}, +] + +[package.dependencies] +duckdb = ">=0.4.0" +sqlalchemy = ">=1.3.22" + [[package]] name = "editorconfig" version = "0.12.4" @@ -4177,6 +4193,7 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "PyTrie-0.4.0-py3-none-any.whl", hash = "sha256:f687c224ee8c66cda8e8628a903011b692635ffbb08d4b39c5f92b18eb78c950"}, {file = "PyTrie-0.4.0.tar.gz", hash = "sha256:8f4488f402d3465993fb6b6efa09866849ed8cda7903b50647b7d0342b805379"}, ] @@ -6243,4 +6260,4 @@ mariadb = [] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "6e3ed517c6397fb41ecd6e3e1b3183402f5900c297ae9d90baa2440ca6940702" +content-hash = "881055daa9017c49e821618c553b33eeae2b7a8fb32794d532b5b96ac17af859" diff --git a/pyproject.toml b/pyproject.toml index b03f8f9..eb227bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ xmltodict = "^0.13.0" click-default-group = "^1.2.4" linkml-runtime = "^1.7.2" duckdb = "^0.10.1" +duckdb-engine = "^0.11.2" [tool.poetry.dev-dependencies] diff --git a/schema_automator/cli.py b/schema_automator/cli.py index 18c33ef..5f7bba9 100644 --- a/schema_automator/cli.py +++ b/schema_automator/cli.py @@ -242,7 +242,7 @@ def import_dosdps(dpfiles, output, **args): @schema_name_option def import_sql(db, output, **args): """ - Imports a schema by introspecting a relational database + Imports a schema by introspecting a relational database. See :ref:`importers` for more on the importers framework """ @@ -297,13 +297,14 @@ def import_htmltable(url, output, class_name, schema_name, columns, @click.option('--enum-threshold', default=0.1, help='if the number of distinct values / rows is less than this, do not make an enum') @click.option('--omit-null/--no-omit-null', default=False, help="if true, ignore null values") @click.option('--inlined-map', multiple=True, help="SLOT_NAME.KEY pairs indicating which slots are inlined as dict") +@click.option('--index-slot', help="slot to inject for lists of objects") @click.option('--depluralize/--no-depluralized', default=True, show_default=True, help="Auto-depluralize class names to singular form") def generalize_json(input, output, schema_name, depluralize: bool, format, omit_null, inlined_map, **kwargs): """ - Generalizes from a JSON file to a schema + Generalizes from a JSON (or YAML) file to a schema See :ref:`generalizers` for more on the generalization framework diff --git a/schema_automator/generalizers/csv_data_generalizer.py b/schema_automator/generalizers/csv_data_generalizer.py index 2f8c196..864e24c 100644 --- a/schema_automator/generalizers/csv_data_generalizer.py +++ b/schema_automator/generalizers/csv_data_generalizer.py @@ -264,7 +264,7 @@ def convert_multiple(self, files: List[str], **kwargs) -> SchemaDefinition: self.inject_foreign_keys(sv, fks) return sv.schema - def convert(self, file: str, **kwargs) -> SchemaDefinition: + def convert(self, file: str, delimiter=None, **kwargs) -> SchemaDefinition: """ Converts a single TSV file to a single-class schema @@ -272,9 +272,14 @@ def convert(self, file: str, **kwargs) -> SchemaDefinition: :param kwargs: :return: """ + if delimiter is None: + if file.endswith(".csv"): + delimiter = "," + else: + delimiter = self.column_separator with open(file, newline='', encoding='utf-8') as tsv_file: - header = [h.strip() for h in tsv_file.readline().split('\t')] - rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=self.column_separator, skipinitialspace=False) + header = [h.strip() for h in tsv_file.readline().split(delimiter)] + rr = csv.DictReader(tsv_file, fieldnames=header, delimiter=delimiter, skipinitialspace=False) return self.convert_dicts([r for r in rr], **kwargs) def convert_from_dataframe(self, df: pd.DataFrame, **kwargs) -> SchemaDefinition: diff --git a/schema_automator/generalizers/json_instance_generalizer.py b/schema_automator/generalizers/json_instance_generalizer.py index 105fc76..f6cc1a6 100644 --- a/schema_automator/generalizers/json_instance_generalizer.py +++ b/schema_automator/generalizers/json_instance_generalizer.py @@ -36,13 +36,14 @@ class JsonDataGeneralizer(Generalizer): def convert(self, input: Union[str, Dict], format: str = 'json', + index_slot: str = None, container_class_name='Container', **kwargs) -> SchemaDefinition: """ Generalizes from a JSON file :param input: - :param format: + :param format: json or yaml; use yaml_multi for multiple documents :param container_class_name: :param kwargs: :return: @@ -62,6 +63,8 @@ def convert(self, input: Union[str, Dict], format: str = 'json', obj = json.load(stream) elif format == 'yaml': obj = yaml.safe_load(stream) + elif format == 'yaml_multi': + obj = list(yaml.safe_load_all(stream)) elif format == 'toml': obj_str = "".join(stream.readlines()) toml_obj = tomlkit.parse(obj_str) @@ -69,6 +72,10 @@ def convert(self, input: Union[str, Dict], format: str = 'json', obj = json.loads(json_str) else: raise Exception(f'bad format {format}') + if isinstance(obj, list): + if index_slot is None: + index_slot = 'members' + obj = {index_slot: obj} rows_by_table = defaultdict(list) self.rows_by_table = rows_by_table self._convert_obj(obj, table=container_class_name) diff --git a/schema_automator/importers/fhir_codesystem_import_engine.py b/schema_automator/importers/fhir_codesystem_import_engine.py new file mode 100644 index 0000000..12ead25 --- /dev/null +++ b/schema_automator/importers/fhir_codesystem_import_engine.py @@ -0,0 +1,56 @@ +import json +from typing import Dict, Any + +from linkml_runtime.linkml_model import SchemaDefinition, EnumDefinition, PermissibleValue +from schema_automator.importers.import_engine import ImportEngine + +class FHIRCodeSystemImportEngine(ImportEngine): + def load(self, input: str) -> SchemaDefinition: + # Parse the JSON input + data = json.loads(input) + + # Create a new SchemaDefinition + schema = SchemaDefinition( + name=data.get('name', 'FHIRCodeSystem'), + id=data.get('url', 'http://example.org/FHIRCodeSystem') + ) + + # Define the Enum for the CodeSystem + code_system_enum = EnumDefinition( + name='CodeSystemEnum', + description=data.get('description', 'A FHIR CodeSystem resource') + ) + + # Process the concepts and create permissible values + if 'concept' in data: + code_system_enum.permissible_values = self._process_concepts(data['concept']) + + # Add the Enum to the schema + schema.enums = { + 'CodeSystemEnum': code_system_enum + } + + return schema + + def _process_concepts(self, concepts: Dict[str, Any]) -> Dict[str, PermissibleValue]: + permissible_values = {} + + for concept in concepts: + code = concept['code'] + pv = PermissibleValue( + text=code, + title=concept.get('display', None), + description=concept.get('definition', None), + ) + + # Check for parent relationships in properties + for prop in concept.get('property', []): + if prop['code'] == 'subsumedBy': + pv.is_a = prop['valueCode'] + if prop['code'] == 'status': + pv.status = prop['valueCode'] + + permissible_values[code] = pv + + return permissible_values + diff --git a/tests/test_importers/test_fhir_codesystem_importer.py b/tests/test_importers/test_fhir_codesystem_importer.py new file mode 100644 index 0000000..2c8344e --- /dev/null +++ b/tests/test_importers/test_fhir_codesystem_importer.py @@ -0,0 +1,25 @@ +import os + +from linkml.generators import PythonGenerator +from linkml_runtime import SchemaView + +from schema_automator.importers.fhir_codesystem_import_engine import FHIRCodeSystemImportEngine +from schema_automator.utils import write_schema +from tests import INPUT_DIR, OUTPUT_DIR + +INPUT_JSON = os.path.join(INPUT_DIR, "CodeSystem-v3-RoleCode.json") +OUT = os.path.join(OUTPUT_DIR, "CodeSystem-v3-RoleCode.linkml.yaml") + + +def test_fhir_code_system_import(): + with open(INPUT_JSON, "r", encoding="utf-8") as f: + input_data = f.read() + + ie = FHIRCodeSystemImportEngine() + schema = ie.load(input_data) + assert schema + write_schema(schema, OUT) + + py_str = PythonGenerator(OUT).serialize() + assert py_str + _sv = SchemaView(schema) \ No newline at end of file