Skip to content

Commit

Permalink
Addressing MedGen conflicts
Browse files Browse the repository at this point in the history
Addressing issues with our mappings to MedGen. See: monarch-initiative/mondo-ingest#273
- Add: scripts/: scripts to help with addressing these MedGen issues and also running analysis
- Update: mondo.Makefile: Added goals: Several goals to do the update and also analyze it
- Update: requirements.txt files: openpyxl
  • Loading branch information
joeflack4 committed Aug 17, 2023
1 parent d0658fb commit d6ec8ae
Show file tree
Hide file tree
Showing 6 changed files with 247 additions and 2 deletions.
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
openpyxl
pandas
pyyaml
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
backports.entry-points-selectable==1.1.0
distlib==0.3.3
et-xmlfile==1.1.0
filelock==3.2.0
numpy==1.23.2
openpyxl==3.1.2
pandas==1.4.3
pbr==5.6.0
platformdirs==2.4.0
Expand All @@ -12,4 +14,3 @@ six==1.16.0
stevedore==3.4.0
virtualenv==20.8.1
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
30 changes: 29 additions & 1 deletion src/ontology/mondo.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,35 @@ reports/%-rare-diseases.tsv: $(ONT)-base.owl reports/%-rare-diseases.txt
rare-disease-reports: reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv
python ../scripts/filter_rare_disease_list.py reports/old-rare-diseases.tsv reports/new-rare-diseases.tsv reports/added-rare-disases.tsv reports/removed-rare-diseases.tsv


#############################################
##### One-time scripts ######################
#############################################
# address-medgen-conflicts-aug2023 pipeline
# - https://github.com/monarch-initiative/mondo-ingest/issues/273
# Each rule below produces the input of the next one; the final phony goal applies the result to mondo-edit.obo.

# Step 1: download the MedGen conflicts workbook (a file attachment hosted under the mondo-ingest GitHub repository).
tmp/July2023_CUIReports_FromMedGentoMondo.xlsx:
mkdir -p tmp && wget "https://github.com/monarch-initiative/mondo-ingest/files/12029712/July2023_CUIReports_FromMedGentoMondo.xlsx" -O $@

# Step 2: collate the bad xref CUIs from the workbook's sheets into a plain text list.
tmp/bad-medgen-xrefs.txt: tmp/July2023_CUIReports_FromMedGentoMondo.xlsx
python ../scripts/bad_medgen_xrefs_collate.py -i tmp/July2023_CUIReports_FromMedGentoMondo.xlsx -o $@

# Step 3: turn the CUI list into a shell script holding a single 'grep -v' command against mondo-edit.obo.
tmp/bad-medgen-xrefs-grep-command.sh: tmp/bad-medgen-xrefs.txt
python ../scripts/bad_medgen_xrefs_grep_command.py -i tmp/bad-medgen-xrefs.txt -t mondo-edit.obo -o $@

# Step 4: run the generated script; its command redirects the filtered ontology to mondo-edit.obo.tmp.
mondo-edit.obo.tmp: tmp/bad-medgen-xrefs-grep-command.sh
sh $<

# Step 5: record what the filter removed. The leading '-' makes make ignore diff's exit status, which is non-zero
# whenever the two files differ.
tmp/mondo-edit.obo.tmp.diff: mondo-edit.obo.tmp
-diff mondo-edit.obo mondo-edit.obo.tmp > $@

# Step 6: summarise the diff into a QC report (counts of removed xrefs and non-xref lines).
tmp/report-qc-medgen-conflicts-update-diff.tsv: tmp/mondo-edit.obo.tmp.diff
python ../scripts/bad_medgen_xrefs_update_analyze_diff.py -i tmp/mondo-edit.obo.tmp.diff -o $@

# Entry point: after the QC report is built, additionally drop every line containing 'source="UMLS:CN' and every
# line containing 'MEDGEN:', overwrite mondo-edit.obo with the result, and delete the intermediate files.
.PHONY: address-medgen-conflicts-aug2023
address-medgen-conflicts-aug2023: tmp/report-qc-medgen-conflicts-update-diff.tsv
grep -v 'source="UMLS:CN' mondo-edit.obo.tmp > mondo-edit.obo.tmp2.tmp
grep -v MEDGEN: mondo-edit.obo.tmp2.tmp > mondo-edit.obo.tmp3.tmp
mv mondo-edit.obo.tmp3.tmp mondo-edit.obo
rm mondo-edit.obo.tmp2.tmp mondo-edit.obo.tmp


#############################################
Expand Down
88 changes: 88 additions & 0 deletions src/scripts/bad_medgen_xrefs_collate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Collate bad medgen xrefs
From Excel file containing worksheets about MedGen conflicts, create a text file that lists all the bad CUIs for xrefs
that need to be removed from Mondo.
See also:
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
- Megan Kane's source files including readme.txt that goes over each sheet:
https://github.com/monarch-initiative/mondo-ingest/issues/273#issuecomment-1632774012
Prerequisites:
1. Python package 'openpyxl' to read .xlsx (listed in requirements.txt)
"""
from argparse import ArgumentParser
from pathlib import Path
from typing import List, Set, Union

import pandas as pd

# Directory that contains this script.
SCRIPTS_DIR = Path(__file__).parent
# Two levels above the scripts directory.
PROJECT_DIR = SCRIPTS_DIR.parent.parent
# Scratch directory under src/ontology where the pipeline's intermediate files live.
TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp"
# Default input workbook and default output list, both inside TMP_DIR (kept as str for argparse defaults).
INPUT_FILE = str(TMP_DIR / "July2023_CUIReports_FromMedGentoMondo.xlsx")
OUTPUT_FILE = str(TMP_DIR / "bad-medgen-xrefs.txt")


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
    """Collect every bad MedGen xref CUI from the conflicts workbook into a text file.

    Five worksheets are read. From each one the CUIs that should no longer be xref'd from Mondo are gathered into a
    single de-duplicated set, which is then written out as strings in sorted order, one per line.

    Args:
        input_file: Path to the Excel workbook holding the MedGen conflict sheets.
        output_file: Path of the text file to write (no header, no index column).

    Errors raised by pandas for a missing sheet or column are not caught and propagate to the caller.
    """
    bad_xref_cuis: Set[Union[str, int]] = set()

    # The context manager closes the workbook handle even if parsing a sheet fails.
    with pd.ExcelFile(input_file) as book:
        # Sheet 1/5: Bad_CNxRefs_withCUI
        # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
        # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs in "mondo_xref_bad".
        # Blank cells are dropped so that a literal 'nan' never ends up in the output list.
        df = book.parse("Bad_CNxRefs_withCUI")
        bad_xref_cuis.update(df['mondo_xref_bad'].dropna().tolist())

        # Sheet 2/5: WrongCUIxref_withCurrCUI
        # - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
        # - recommended action: For Mondo terms identified in "mondo_id", remove the xrefs in "mondo_xref_bad".
        df = book.parse("WrongCUIxref_withCurrCUI")
        bad_xref_cuis.update(df['mondo_xref_bad'].dropna().tolist())

        # Sheet 3/5: >1MondoID_to1CUI
        # - summary: MedGen has identified better Mondo IDs to map to these CUIs.
        # - recommended action: For terms in the "mondo_id(Mondo)" column, remove xrefs in the "UMLS_CUI" column.
        df = book.parse(">1MondoID_to1CUI")
        bad_xref_cuis.update(df['UMLS_CUI'].dropna().tolist())

        # Sheet 4/5: Mondo_MedGen_CUImismatch_1Mondo
        # - summary: We have Mondo terms that have xrefs to multiple MedGen CUIs. Only 1 of them should be left in.
        # - recommended action: Remove incorrect xrefs; the ones not identified in the "MedGenCUI" column.
        # Every non-Mondo entry of the ' OR '-separated query that differs from the correct CUI is collected
        # (not only the first one), and a row with nothing left to remove is simply skipped.
        df = book.parse("Mondo_MedGen_CUImismatch_1Mondo")
        for query, correct_cui in zip(df['medgen_query'], df['MedGenCUI']):
            if not isinstance(query, str):
                continue  # blank cell: there is nothing to split
            bad_xref_cuis.update(
                x for x in query.split(' OR ')
                if 'MONDO' not in x and x != correct_cui)

        # Sheet 5/5: MondoXrefs_NotinMedGen
        # - summary: These terms do not exist in MedGen.
        # - recommended action: Remove these xrefs.
        df = book.parse("MondoXrefs_NotinMedGen")
        bad_xref_cuis.update(df['ref_target'].dropna().tolist())

    # Convert to strings and sort (values read from Excel may be ints as well as strings)
    sorted_cuis: List[str] = sorted(str(x) for x in bad_xref_cuis)

    # Output: List of CUIs of bad xrefs
    out_df = pd.DataFrame(sorted_cuis, columns=['bad_xref_cuis'])
    out_df.to_csv(output_file, index=False, header=False)

def cli():
    """Read the input/output paths from the command line and run the collation."""
    arg_parser = ArgumentParser(
        prog='Collate bad medgen xrefs',
        description='From excel file containing workseets about MedGen conflicts, create a text file that lists all '
                    'the bad CUIs for xrefs that need to be removed from Mondo.')
    # One entry per option: (flags, default value, help text)
    options = (
        (('-i', '--input-file'), INPUT_FILE, 'Excel file containing conflitcs sheets'),
        (('-o', '--output-file'), OUTPUT_FILE, 'Txt file to contain CUIs for bad xrefs'),
    )
    for flags, default, help_text in options:
        arg_parser.add_argument(*flags, default=default, help=help_text)
    parsed = arg_parser.parse_args()
    run(input_file=parsed.input_file, output_file=parsed.output_file)


if __name__ == '__main__':
    cli()
40 changes: 40 additions & 0 deletions src/scripts/bad_medgen_xrefs_grep_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""From a list of bad MedGen xrefs, generate a grep command to remove them from Mondo.
See also:
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
"""
from argparse import ArgumentParser


def run(input_file: str, output_file: str, target: str):
    """Write a shell script whose command copies *target* minus the bad MedGen xref lines.

    The script contains one ``grep -v`` command with an ``-e "xref: UMLS:<CUI>"`` pattern per CUI; its output is
    redirected to ``<target>.tmp``. When the CUI list is empty the script contains a plain ``cp`` instead, because
    ``grep -v`` without any ``-e`` would treat the target path as the pattern and wait on stdin.

    Args:
        input_file: Text file with one bad xref CUI per line; blank lines are ignored.
        output_file: Path of the shell script to write. Any '.txt' in this path is replaced by
            '-grep-command.sh' (behaviour kept from the original implementation).
        target: File the generated command reads from.
    """
    import shlex  # local import: this module's header only imports argparse

    with open(input_file, "r") as file:
        # Skip blank lines: an empty CUI would produce the pattern "xref: UMLS:", which matches every UMLS xref.
        bad_xref_cuis = [cui for cui in (line.strip() for line in file) if cui]

    script_path = output_file.replace('.txt', '-grep-command.sh')

    # Quote the paths so that names containing spaces or shell metacharacters survive; plain names are unchanged.
    source = shlex.quote(target)
    destination = shlex.quote(target + '.tmp')

    if bad_xref_cuis:
        # NOTE(review): grep matches substrings, so a CUI that is a prefix of a longer ID would also remove the
        # longer one - confirm that the IDs in the list cannot prefix each other.
        patterns = ''.join(f'-e "xref: UMLS:{cui}" ' for cui in bad_xref_cuis)
        command_string = f"grep -v {patterns}{source} > {destination}\n"
    else:
        command_string = f"cp {source} {destination}\n"

    with open(script_path, "w") as file:
        file.write(command_string)

def cli():
"""Command line interface."""
parser = ArgumentParser(
prog='Collate bad medgen xrefs',
description='From excel file containing workseets about MedGen conflicts, create a text file that lists all '
'the bad CUIs for xrefs that need to be removed from Mondo.')
parser.add_argument(
'-i', '--input-file', help='Txt file to contain CUIs for bad xrefs')
parser.add_argument(
'-o', '--output-file', help='Txt file to contain CUIs for bad xrefs')
parser.add_argument(
'-t', '--target', help='Target file to use grep on')
run(**vars(parser.parse_args()))


if __name__ == '__main__':
cli()
87 changes: 87 additions & 0 deletions src/scripts/bad_medgen_xrefs_update_analyze_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""MedGen conflict updates QC analysis
QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts.
See also:
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273
"""
from argparse import ArgumentParser
from collections import Counter
from pathlib import Path

import pandas as pd

SCRIPTS_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPTS_DIR.parent.parent
TMP_DIR = PROJECT_DIR / "src" / "ontology" / "tmp"
INPUT_FILE = str(TMP_DIR / "mondo-edit.obo.tmp.diff")
OUTPUT_FILE = str(TMP_DIR / "report-qc-medgen-conflicts-update-diff.tsv")


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
    """Summarise which lines a diff removed and write the metrics as a two-column TSV.

    Only removed lines (those prefixed with '<' in the diff) are analysed; added lines ('>') and the diff's
    position/separator lines are ignored, so every metric describes removals only.

    Args:
        input_file: Path to a diff in the default ``diff`` output format.
        output_file: Path of the TSV report to write, with columns ``metric_name`` and ``val``.
    """
    report = {}

    # Read diff, keeping only the removed lines and stripping their 2-character '< ' marker
    with open(input_file) as f:
        removed = [line[2:] for line in f if line.startswith('<')]

    # Report: How many lines removed
    report['n_lines_removed'] = len(removed)

    # Report: How many xrefs / non-xrefs removed
    xref_removals = [line for line in removed if line.startswith('xref')]
    non_xref_removals = [line for line in removed if not line.startswith('xref')]
    report['n_xrefs_removed'] = len(xref_removals)
    report['n_non_xrefs_removals'] = len(non_xref_removals)

    # Report: Non-xref removals details (the removed lines themselves, concatenated)
    report['non_xref_removals_content'] = ''.join(non_xref_removals)

    # Report: Non MedGen xrefs removed that got tagged because UMLS was a source
    umls_removals = [line for line in xref_removals if line.startswith('xref: UMLS')]
    report['n_xrefs_removed_cuz_umls_was_a_source'] = len(xref_removals) - len(umls_removals)

    # Report: Number of MedGen xrefs removed
    report['n_umls_removals'] = len(umls_removals)

    # Report: MedGen xref deletions on multiple Mondo IDs
    # Whitespace split (not split(' ')) so an xref without trailing annotations does not keep its newline and
    # get counted under a different key than the same CUI with annotations.
    cui_removal_counts = Counter(line.split()[1] for line in umls_removals)
    # Maps "removed n times" -> "number of CUIs removed that many times"
    n_removal_frequency = Counter(cui_removal_counts.values())
    for k, v in n_removal_frequency.items():
        report[f'n_single_cui_removed_from_xrefs_n_times__{k}'] = v

    # Save
    df = pd.DataFrame([{'metric_name': k, 'val': v} for k, v in report.items()])
    df.to_csv(output_file, sep='\t', index=False)

def cli():
    """Read the diff/report paths from the command line and run the QC analysis."""
    arg_parser = ArgumentParser(
        prog='MedGen conflict updates QC analysis',
        description='QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts')
    # One entry per option: (flags, default value, help text)
    options = (
        (('-i', '--input-file'), INPUT_FILE, 'Diff of mondo-edit.obo and mondo-edit.obo.tmp'),
        (('-o', '--output-file'), OUTPUT_FILE, 'Analysis results'),
    )
    for flags, default, help_text in options:
        arg_parser.add_argument(*flags, default=default, help=help_text)
    parsed = arg_parser.parse_args()
    run(input_file=parsed.input_file, output_file=parsed.output_file)


if __name__ == '__main__':
    cli()

0 comments on commit d6ec8ae

Please sign in to comment.