Addressing MedGen conflicts

Addressing issues with our mappings to MedGen. See: monarch-initiative/mondo-ingest#273 - Add: scripts/: scripts to help with addressing these MedGen issues and also running analysis - Update: mondo.Makefile: Added goals: Several goals to do the update and also analyze it - Update: requirements.txt files: openpyxl
monarch-initiative · Aug 17, 2023 · d6ec8ae · d6ec8ae
1 parent d0658fb
commit d6ec8ae
Show file tree

Hide file tree

Showing 6 changed files with 247 additions and 2 deletions.
diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
@@ -1,2 +1,3 @@
+openpyxl
 pandas
 pyyaml
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,9 @@
 backports.entry-points-selectable==1.1.0
 distlib==0.3.3
+et-xmlfile==1.1.0
 filelock==3.2.0
 numpy==1.23.2
+openpyxl==3.1.2
 pandas==1.4.3
 pbr==5.6.0
 platformdirs==2.4.0
@@ -12,4 +14,3 @@ six==1.16.0
 stevedore==3.4.0
 virtualenv==20.8.1
 virtualenv-clone==0.5.7
-virtualenvwrapper==4.8.4
diff --git a/src/ontology/mondo.Makefile b/src/ontology/mondo.Makefile
@@ -264,7 +264,35 @@ reports/%-rare-diseases.tsv: $(ONT)-base.owl reports/%-rare-diseases.txt
 rare-disease-reports: reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv
 	python ../scripts/filter_rare_disease_list.py reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv reports/added-rare-disases.tsv reports/removed-rare-diseases.tsv
 
-
+#############################################
+##### One-time scripts ######################
+#############################################
+# address-medgen-conflicts-aug2023 pipeline
+# - https://github.com/monarch-initiative/mondo-ingest/issues/273
+# Download the MedGen conflict workbook attached to mondo-ingest issue #273.
+# The file is fetched under a temporary name and only moved into place once
+# wget succeeds, so a failed or interrupted download never leaves a truncated
+# target that make would treat as up to date.
+tmp/July2023_CUIReports_FromMedGentoMondo.xlsx:
+	mkdir -p $(@D)
+	wget "https://github.com/monarch-initiative/mondo-ingest/files/12029712/July2023_CUIReports_FromMedGentoMondo.xlsx" -O $@.part || { rm -f $@.part; exit 1; }
+	mv -f $@.part $@
+
+# Collate the bad CUIs listed in the workbook into a plain-text list.
+tmp/bad-medgen-xrefs.txt: tmp/July2023_CUIReports_FromMedGentoMondo.xlsx
+	python ../scripts/bad_medgen_xrefs_collate.py -i $< -o $@
+
+# Generate a shell script holding the grep command that filters mondo-edit.obo.
+tmp/bad-medgen-xrefs-grep-command.sh: tmp/bad-medgen-xrefs.txt
+	python ../scripts/bad_medgen_xrefs_grep_command.py -i $< -t mondo-edit.obo -o $@
+
+# Running the generated script writes the filtered copy of mondo-edit.obo.
+mondo-edit.obo.tmp: tmp/bad-medgen-xrefs-grep-command.sh
+	sh $<
+
+# diff exits non-zero when the files differ, hence the leading "-".
+tmp/mondo-edit.obo.tmp.diff: mondo-edit.obo.tmp
+	-diff mondo-edit.obo $< > $@
+
+# QC report summarising what the filtering removed.
+tmp/report-qc-medgen-conflicts-update-diff.tsv: tmp/mondo-edit.obo.tmp.diff
+	python ../scripts/bad_medgen_xrefs_update_analyze_diff.py -i $< -o $@
+
+# Apply the MedGen clean-up to mondo-edit.obo in place:
+#   1. drop every line containing 'source="UMLS:CN' from mondo-edit.obo.tmp,
+#   2. drop every line containing 'MEDGEN:' from that result,
+#   3. overwrite mondo-edit.obo with the outcome and delete the intermediates.
+# NOTE(review): both greps match on any line, not only "xref:" lines, so other
+# tags can be removed as well -- check the QC report before committing.
+.PHONY: address-medgen-conflicts-aug2023
+address-medgen-conflicts-aug2023: tmp/report-qc-medgen-conflicts-update-diff.tsv
+	grep -v 'source="UMLS:CN' mondo-edit.obo.tmp > mondo-edit.obo.tmp2.tmp
+	grep -v MEDGEN: mondo-edit.obo.tmp2.tmp > mondo-edit.obo.tmp3.tmp
+	mv mondo-edit.obo.tmp3.tmp mondo-edit.obo
+	rm mondo-edit.obo.tmp2.tmp mondo-edit.obo.tmp
 
 
 #############################################

diff --git a/src/scripts/bad_medgen_xrefs_collate.py b/src/scripts/bad_medgen_xrefs_collate.py
@@ -0,0 +1,88 @@
+"""Collate bad medgen xrefs
+
+From an Excel file containing worksheets about MedGen conflicts, create a text file that lists all the bad CUIs for xrefs
+that need to be removed from Mondo.
+
+See also:
+- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
+- Megan Kane's source files including readme.txt that goes over each sheet:
+  https://github.com/monarch-initiative/mondo-ingest/issues/273#issuecomment-1632774012
+
+
+Prerequisites:
+1. Python package 'openpyxl' to read .xlsx (listed in requirements.txt)
+"""
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Set, Union
+
+import pandas as pd
+
+# Directory that holds this script.
+SCRIPTS_DIR = Path(__file__).parent
+# Two directory levels above SCRIPTS_DIR (presumably the repository root -- confirm).
+PROJECT_DIR = SCRIPTS_DIR.parent.parent
+# Scratch directory used for the default input/output locations.
+TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp"
+# Default workbook to read and default text file to write (used when no CLI option is given).
+INPUT_FILE = str(TMP_DIR / "July2023_CUIReports_FromMedGentoMondo.xlsx")
+OUTPUT_FILE = str(TMP_DIR / "bad-medgen-xrefs.txt")
+
+
+def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
+    """Run analysis and generate output"""
+    book = pd.ExcelFile(input_file)
+    bad_xref_cuis: Set[Union[str, int]] = set()
+
+    # Sheet 1/5: Bad_CNxRefs_withCUI
+    # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
+    # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in "mondo_xref_bad".
+    df = book.parse("Bad_CNxRefs_withCUI")
+    bad_xref_cuis.update(df['mondo_xref_bad'].tolist())
+
+    # Sheet 2/5: WrongCUIxref_withCurrCUI
+    # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
+    # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs identified in "mondo_xref_bad".
+    df = book.parse("WrongCUIxref_withCurrCUI")
+    bad_xref_cuis.update(df['mondo_xref_bad'].tolist())
+
+    # Sheet 3/5: 1MondoID_to1CUI
+    # - summary: MedGen has identified better Mondo IDs to map to these CUIs. 
+    # - recommended action: For terms in the "mondo_id(Mondo)" column, remove xrefs identified in the "UMLS_CUI" column.
+    df = book.parse(">1MondoID_to1CUI")
+    bad_xref_cuis.update(df['UMLS_CUI'].tolist())
+
+    # Sheet 4/5: Mondo_MedGen_CUImismatch_1Mondo
+    # - summary: We have Mondo terms that have xrefs to multiple MedGen CUIs. Only 1 of them should be left in.
+    # - recommended action: Remove incorrect xrefs; the ones not identified in the "MedGenCUI" column.
+    df = book.parse("Mondo_MedGen_CUImismatch_1Mondo")
+    to_remove = list(df.apply(
+        lambda row: [x for x in row['medgen_query'].split(' OR ')
+                     if 'MONDO' not in x
+                     and x != row['MedGenCUI']][0], axis=1))
+    bad_xref_cuis.update(to_remove)
+
+    # Sheet 5/5: MondoXrefs_NotinMedGen
+    # - summary: These terms do not exist in MedGen.
+    # - recommended action: Remove these xrefs.
+    df = book.parse("MondoXrefs_NotinMedGen")
+    bad_xref_cuis.update(df['ref_target'].tolist())
+
+    # Convert to strings and sort
+    bad_xref_cuis: List[str] = sorted([str(x) for x in bad_xref_cuis])
+
+    # Output: List of CUIs of bad xrefs
+    out_df = pd.DataFrame(bad_xref_cuis, columns=['bad_xref_cuis'])
+    out_df.to_csv(output_file, index=False, header=False)
+
+def cli():
+    """Command line interface."""
+    parser = ArgumentParser(
+        prog='Collate bad medgen xrefs',
+        description='From excel file containing workseets about MedGen conflicts, create a text file that lists all '
+                    'the bad CUIs for xrefs that need to be removed from Mondo.')
+    parser.add_argument(
+        '-i', '--input-file', default=INPUT_FILE, help='Excel file containing conflitcs sheets')
+    parser.add_argument(
+        '-o', '--output-file', default=OUTPUT_FILE, help='Txt file to contain CUIs for bad xrefs')
+    run(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/src/scripts/bad_medgen_xrefs_grep_command.py b/src/scripts/bad_medgen_xrefs_grep_command.py
@@ -0,0 +1,40 @@
+"""From a list of bad MedGen xrefs, generate a grep command to remove them from Mondo.
+
+See also:
+- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
+"""
+from argparse import ArgumentParser
+
+
+def run(input_file: str, output_file: str, target: str):
+    """Create grep command"""
+    bad_xref_cuis = []
+    with open(input_file, "r") as file:
+        for line in file:
+            bad_xref_cuis.append(line.strip())
+
+    output_file2 = output_file.replace('.txt', '-grep-command.sh')
+    command_string = "grep -v "
+    for s in bad_xref_cuis:
+        command_string += f'-e "xref: UMLS:{s}" '
+    command_string += f"{target} > {target + '.tmp'}\n"
+    with open(output_file2, "w") as file:
+        file.write(command_string)
+
+def cli():
+    """Command line interface."""
+    parser = ArgumentParser(
+        prog='Collate bad medgen xrefs',
+        description='From excel file containing workseets about MedGen conflicts, create a text file that lists all '
+                    'the bad CUIs for xrefs that need to be removed from Mondo.')
+    parser.add_argument(
+        '-i', '--input-file', help='Txt file to contain CUIs for bad xrefs')
+    parser.add_argument(
+        '-o', '--output-file', help='Txt file to contain CUIs for bad xrefs')
+    parser.add_argument(
+        '-t', '--target', help='Target file to use grep on')
+    run(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()
diff --git a/src/scripts/bad_medgen_xrefs_update_analyze_diff.py b/src/scripts/bad_medgen_xrefs_update_analyze_diff.py
@@ -0,0 +1,87 @@
+"""MedGen conflict updates QC analysis
+
+QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts.
+
+See also:
+- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
+"""
+from argparse import ArgumentParser
+from collections import Counter
+from pathlib import Path
+
+import pandas as pd
+
+# Directory that holds this script.
+SCRIPTS_DIR = Path(__file__).parent
+# Two directory levels above SCRIPTS_DIR (presumably the repository root -- confirm).
+PROJECT_DIR = SCRIPTS_DIR.parent.parent
+# Scratch directory used for the default input/output locations.
+TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp"
+# Default diff to analyse and default report to write (used when no CLI option is given).
+INPUT_FILE = str(TMP_DIR / "mondo-edit.obo.tmp.diff")
+OUTPUT_FILE = str(TMP_DIR / "report-qc-medgen-conflicts-update-diff.tsv")
+
+
+def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
+    """Run analysis and generate output"""
+    report = {}
+
+    # Read diff
+    with open(input_file) as f:
+        lines = f.readlines()
+
+    # Remove hashes, leaving only the edited lines themselves
+    lines = [l for l in lines if l.startswith('<') or l.startswith('>')]
+
+    # Report: How many lines added/removed
+    report['n_lines_removed'] = len([x for x in lines if x.startswith('<')])
+    # report['n_lines_added'] = len([x for x in lines if x.startswith('>')])  # 0
+
+    # Remove '< ' / '> '
+    lines = [x[2:] for x in lines]
+
+    # Report: How many xrefs / non-xrefs removed
+    xref_removals = [l for l in lines if l.startswith('xref')]
+    report['n_xrefs_removed'] = len(xref_removals)
+    non_xref_removals = [l for l in lines if not l.startswith('xref')]
+    report['n_non_xrefs_removals'] = len(non_xref_removals)
+
+    # Report: Non-xref removals detailes
+    report['non_xref_removals_content'] = ''
+    for l in non_xref_removals:
+        report['non_xref_removals_content'] += l
+
+    # Report: Non MedGen xrefs removed that got tagged because UMLS was a source
+    non_umls_removals = [x for x in xref_removals if not x.startswith('xref: UMLS')]
+    report['n_xrefs_removed_cuz_umls_was_a_source'] = len(non_umls_removals)
+
+    # Report: Number of MedGen xrefs removed
+    umls_removals = [x for x in xref_removals if x.startswith('xref: UMLS')]
+    report['n_umls_removals'] = len(umls_removals)
+
+    # Report: MedGen xref deletions on multiple Mondo IDs
+    cui_removal_counts = {}
+    for l in umls_removals:
+        parts = l.split(' ')
+        cui = parts[1]
+        if cui not in cui_removal_counts:
+            cui_removal_counts[cui] = 0
+        cui_removal_counts[cui] += 1
+    n_removal_frequency = Counter(cui_removal_counts.values())
+    for k, v in n_removal_frequency.items():
+        report[f'n_single_cui_removed_from_xrefs_n_times__{k}'] = v
+
+    # Saved
+    df = pd.DataFrame([{'metric_name': k, 'val': v}  for k, v in report.items()])
+    df.to_csv(output_file, sep='\t', index=False)
+
+def cli():
+    """Command line interface."""
+    parser = ArgumentParser(
+        prog='MedGen conflict updates QC analysis',
+        description='QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts')
+    parser.add_argument(
+        '-i', '--input-file', default=INPUT_FILE, help='Diff of mondo-edit.obo and mondo-edit.obo.tmp')
+    parser.add_argument(
+        '-o', '--output-file', default=OUTPUT_FILE, help='Analysis results')
+    run(**vars(parser.parse_args()))
+
+
+if __name__ == '__main__':
+    cli()