Addressing MedGen conflicts

Addressing issues with our mappings to MedGen. See: monarch-initiative/mondo-ingest#273 - Update: mondo-edit.obo: Removed xrefs flagged as incorrect - Add: templates/ROBOT_addMedGen.tsv - Add: Temp: scripts/collate_bad_medgen_xrefs.py - Update: Temp: mondo.Makefile: Added goals: address-medgen-conflicts, tmp/bad-medgen-xrefs.txt, tmp/July2023_CUIReports_FromMedGentoMondo.xlsx
monarch-initiative · Aug 15, 2023 · a82aa2e · a82aa2e
1 parent d0658fb
commit a82aa2e
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 1 deletion.
diff --git a/src/ontology/mondo.Makefile b/src/ontology/mondo.Makefile
@@ -264,7 +264,20 @@ reports/%-rare-diseases.tsv: $(ONT)-base.owl reports/%-rare-diseases.txt
 rare-disease-reports: reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv
 	python ../scripts/filter_rare_disease_list.py reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv reports/added-rare-disases.tsv reports/removed-rare-diseases.tsv
 
-
+#############################################
+##### One-time scripts ######################
+#############################################
+
+# Download the MedGen conflict workbook (see mondo-ingest issue #273).
+# wget -O creates/truncates the output file even when the download fails, so remove the partial file and fail the
+# recipe on error; otherwise make would treat the broken file as up to date on the next run.
+tmp/July2023_CUIReports_FromMedGentoMondo.xlsx:
+	mkdir -p tmp && wget "https://github.com/monarch-initiative/mondo-ingest/files/12029712/July2023_CUIReports_FromMedGentoMondo.xlsx" -O $@ || (rm -f $@; exit 1)
+
+# Collate the bad xref CUIs listed in the workbook ($<) into a single text file ($@).
+tmp/bad-medgen-xrefs.txt: tmp/July2023_CUIReports_FromMedGentoMondo.xlsx
+	python ../scripts/collate_bad_medgen_xrefs.py -i $< -o $@
+
+# address-medgen-conflicts: https://github.com/monarch-initiative/mondo-ingest/issues/273
+# For now the recipe only prints a reminder of the remaining step (grep -v on mondo-edit.obo).
+.PHONY: address-medgen-conflicts
+address-medgen-conflicts: tmp/bad-medgen-xrefs.txt
+	echo 3. grep -v on mondo-edit.obo
 
 
 #############################################

diff --git a/src/scripts/collate_bad_medgen_xrefs.py b/src/scripts/collate_bad_medgen_xrefs.py
@@ -0,0 +1,85 @@
+"""Collate bad medgen xrefs
+
+From an Excel file containing worksheets about MedGen conflicts, create a text file that lists all the bad CUIs for
+xrefs that need to be removed from Mondo.
+
+See also:
+- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
+- Megan Kane's source files including readme.txt that goes over each sheet:
+  https://github.com/monarch-initiative/mondo-ingest/issues/273#issuecomment-1632774012
+
+
+Prerequisites:
+1. Python package 'openpyxl' to read .xlsx (left out of requirements.txt since this is a temporary script)
+"""
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Set, Union
+
+import pandas as pd
+
+# Directory that contains this script.
+SCRIPTS_DIR = Path(__file__).parent
+# Two levels above the scripts directory.
+PROJECT_DIR = SCRIPTS_DIR.parent.parent
+# Scratch directory under src/ontology where the workbook is read from and the result is written to.
+TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp"
+# Default input workbook and default output text file, kept as plain strings.
+INPUT_FILE = str(TMP_DIR / "July2023_CUIReports_FromMedGentoMondo.xlsx")
+OUTPUT_FILE = str(TMP_DIR / "bad-medgen-xrefs.txt")
+
+
+def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
+    """Collate the bad MedGen xrefs from all worksheets and write them to a text file.
+
+    Args:
+        input_file: Path to the Excel workbook containing the MedGen conflict worksheets.
+        output_file: Path of the text file to write: one identifier per line, de-duplicated, sorted as strings and
+          without a header row.
+    """
+    bad_xref_cuis: Set[Union[str, int]] = set()
+
+    # Context manager so the workbook's file handle is released even if a sheet fails to parse.
+    # Blank cells are read as NaN; they are dropped everywhere so a literal "nan" never ends up in the output.
+    with pd.ExcelFile(input_file) as book:
+        # Sheet 1/5: Bad_CNxRefs_withCUI
+        # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
+        # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in
+        #   "mondo_xref_bad".
+        df = book.parse("Bad_CNxRefs_withCUI")
+        bad_xref_cuis.update(df['mondo_xref_bad'].dropna().tolist())
+
+        # Sheet 2/5: WrongCUIxref_withCurrCUI
+        # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
+        # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in
+        #   "mondo_xref_bad".
+        df = book.parse("WrongCUIxref_withCurrCUI")
+        bad_xref_cuis.update(df['mondo_xref_bad'].dropna().tolist())
+
+        # Sheet 3/5: >1MondoID_to1CUI
+        # - summary: MedGen has identified better Mondo IDs to map to these CUIs.
+        # - recommended action: For terms in the "mondo_id(Mondo)" column, remove xrefs identified in the "UMLS_CUI"
+        #   column.
+        df = book.parse(">1MondoID_to1CUI")
+        bad_xref_cuis.update(df['UMLS_CUI'].dropna().tolist())
+
+        # Sheet 4/5: Mondo_MedGen_CUImismatch_1Mondo
+        # - summary: We have Mondo terms that have xrefs to multiple MedGen CUIs. Only 1 of them should be left in.
+        # - recommended action: Remove incorrect xrefs; the ones not identified in the "MedGenCUI" column.
+        df = book.parse("Mondo_MedGen_CUImismatch_1Mondo")
+        for _, row in df.dropna(subset=['medgen_query']).iterrows():
+            # Collect every query term that is neither the Mondo ID nor the CUI to keep. A row may list more than
+            # one incorrect CUI (all must go) or none at all (nothing to add).
+            bad_xref_cuis.update(
+                x for x in row['medgen_query'].split(' OR ')
+                if 'MONDO' not in x and x != row['MedGenCUI'])
+
+        # Sheet 5/5: MondoXrefs_NotinMedGen
+        # - summary: These terms do not exist in MedGen.
+        # - recommended action: Remove these xrefs.
+        df = book.parse("MondoXrefs_NotinMedGen")
+        bad_xref_cuis.update(df['ref_target'].dropna().tolist())
+
+    # Normalise everything to str before de-duplicating and sorting, since the sheets may mix str and int values.
+    out_df = pd.DataFrame(sorted({str(x) for x in bad_xref_cuis}), columns=['bad_xref_cuis'])
+    out_df.to_csv(output_file, index=False, header=False)
+
+def cli():
+    """Command line interface: parse the input/output file options and run the collation."""
+    parser = ArgumentParser(
+        prog='Collate bad medgen xrefs',
+        description='From excel file containing worksheets about MedGen conflicts, create a text file that lists all '
+                    'the bad CUIs for xrefs that need to be removed from Mondo.')
+    parser.add_argument(
+        '-i', '--input-file', default=INPUT_FILE, help='Excel file containing conflicts sheets')
+    parser.add_argument(
+        '-o', '--output-file', default=OUTPUT_FILE, help='Txt file to contain CUIs for bad xrefs')
+    # argparse stores the options as `input_file` / `output_file`, matching run()'s keyword parameters.
+    run(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()