diff --git a/src/ontology/mondo.Makefile b/src/ontology/mondo.Makefile index 4fae69a339..64f21233cc 100644 --- a/src/ontology/mondo.Makefile +++ b/src/ontology/mondo.Makefile @@ -264,7 +264,20 @@ reports/%-rare-diseases.tsv: $(ONT)-base.owl reports/%-rare-diseases.txt rare-disease-reports: reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv python ../scripts/filter_rare_disease_list.py reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv reports/added-rare-disases.tsv reports/removed-rare-diseases.tsv - +############################################# +##### One-time scripts ###################### +############################################# + +tmp/July2023_CUIReports_FromMedGentoMondo.xlsx: + mkdir -p tmp && wget "https://github.com/monarch-initiative/mondo-ingest/files/12029712/July2023_CUIReports_FromMedGentoMondo.xlsx" -O $@ + +tmp/bad-medgen-xrefs.txt: tmp/July2023_CUIReports_FromMedGentoMondo.xlsx + python ../scripts/collate_bad_medgen_xrefs.py -i tmp/July2023_CUIReports_FromMedGentoMondo.xlsx -o $@ + +# address-medgen-conflicts: https://github.com/monarch-initiative/mondo-ingest/issues/273 +.PHONY: address-medgen-conflicts +address-medgen-conflicts: tmp/bad-medgen-xrefs.txt + echo 3. grep -v on mondo-edit.obo ############################################# diff --git a/src/scripts/collate_bad_medgen_xrefs.py b/src/scripts/collate_bad_medgen_xrefs.py new file mode 100644 index 0000000000..113f13d151 --- /dev/null +++ b/src/scripts/collate_bad_medgen_xrefs.py @@ -0,0 +1,85 @@ +"""Collate bad medgen xrefs + +From excel file containing worksheets about MedGen conflicts, create a text file that lists all the bad CUIs for xrefs +that need to be removed from Mondo. + +See also: +- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273 +- Megan Kane's source files including readme.txt that goes over each sheet: + https://github.com/monarch-initiative/mondo-ingest/issues/273#issuecomment-1632774012 + + +Prerequisites: +1. 
Python package 'openpyxl' to read .xlsx (left out of requirements.txt since this is a temporary script) +""" +from argparse import ArgumentParser +from pathlib import Path +from typing import Set, Union + +import pandas as pd + +SCRIPTS_DIR = Path(__file__).parent +PROJECT_DIR = SCRIPTS_DIR.parent.parent +TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp" +INPUT_FILE = str(TMP_DIR / "July2023_CUIReports_FromMedGentoMondo.xlsx") +OUTPUT_FILE = str(TMP_DIR / "bad-medgen-xrefs.txt") + + +def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE): + """Run analysis and generate output""" + book = pd.ExcelFile(input_file) + bad_xref_cuis: Set[Union[str, int]] = set() + + # Sheet 1/5: Bad_CNxRefs_withCUI + # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs. + # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in "mondo_xref_bad". + df = book.parse("Bad_CNxRefs_withCUI") + bad_xref_cuis.update(df['mondo_xref_bad'].tolist()) + + # Sheet 2/5: WrongCUIxref_withCurrCUI + # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs. + # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in "mondo_xref_bad". + df = book.parse("WrongCUIxref_withCurrCUI") + bad_xref_cuis.update(df['mondo_xref_bad'].tolist()) + + # Sheet 3/5: 1MondoID_to1CUI + # - summary: MedGen has identified better Mondo IDs to map to these CUIs. + # - recommended action: For terms in the "mondo_id(Mondo)" column, remove xrefs identified in the "UMLS_CUI" column. + df = book.parse(">1MondoID_to1CUI") + bad_xref_cuis.update(df['UMLS_CUI'].tolist()) + + # Sheet 4/5: Mondo_MedGen_CUImismatch_1Mondo + # - summary: We have Mondo terms that have xrefs to multiple MedGen CUIs. Only 1 of them should be left in. + # - recommended action: Remove incorrect xrefs; the ones not identified in the "MedGenCUI" column. 
+ df = book.parse("Mondo_MedGen_CUImismatch_1Mondo") + to_remove = list(df.apply( + lambda row: [x for x in row['medgen_query'].split(' OR ') + if 'MONDO' not in x + and x != row['MedGenCUI']][0], axis=1)) + bad_xref_cuis.update(to_remove) + + # Sheet 5/5: MondoXrefs_NotinMedGen + # - summary: These terms do not exist in MedGen. + # - recommended action: Remove these xrefs. + df = book.parse("MondoXrefs_NotinMedGen") + bad_xref_cuis.update(df['ref_target'].tolist()) + + bad_xref_cuis: Set[str] = set([str(x) for x in bad_xref_cuis]) + out_df = pd.DataFrame(bad_xref_cuis, columns=['bad_xref_cuis']).sort_values('bad_xref_cuis') + out_df.to_csv(output_file, index=False, header=False) + +def cli(): + """Command line interface.""" + parser = ArgumentParser( + prog='Collate bad medgen xrefs', + description='From excel file containing workseets about MedGen conflicts, create a text file that lists all ' + 'the bad CUIs for xrefs that need to be removed from Mondo.') + parser.add_argument( + '-i', '--input-file', default=INPUT_FILE, help='Excel file containing conflitcs sheets') + parser.add_argument( + '-o', '--output-file', default=OUTPUT_FILE, help='Txt file to contain CUIs for bad xrefs') + run(**vars(parser.parse_args())) + + +if __name__ == '__main__': + cli()