-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Addressing issues with our mappings to MedGen. See: monarch-initiative/mondo-ingest#273 - Add: Temp: scripts/bad_medgen_xrefs_collate.py - Add: Temp: scripts/bad_medgen_xrefs_grep_command.py - Add: Temp: src/scripts/bad_medgen_xrefs_update_analyze_diff.py - Update: Temp: mondo.Makefile: Added goals: Several goals to do the update and also analyze it
- Loading branch information
Showing
4 changed files
with
247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Collate bad medgen xrefs | ||
From an Excel file containing worksheets about MedGen conflicts, create a text file that lists all the bad CUIs for xrefs | ||
that need to be removed from Mondo. | ||
See also: | ||
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273 | ||
- Megan Kane's source files including readme.txt that goes over each sheet: | ||
https://github.com/monarch-initiative/mondo-ingest/issues/273#issuecomment-1632774012 | ||
Prerequisites: | ||
1. Python package 'openpyxl' to read .xlsx (left out of requirements.txt since this is a temporary script) | ||
""" | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
from typing import List, Set, Union | ||
|
||
import pandas as pd | ||
|
||
# Directory containing this script, and the directory two levels above it (used as the project root).
SCRIPTS_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPTS_DIR.parent.parent
# Working directory for temporary inputs and outputs.
TMP_DIR = PROJECT_DIR.joinpath("src", "ontology", "tmp")
# Default input workbook and output list, stored as plain strings.
INPUT_FILE = str(TMP_DIR.joinpath("July2023_CUIReports_FromMedGentoMondo.xlsx"))
OUTPUT_FILE = str(TMP_DIR.joinpath("bad-medgen-xrefs.txt"))
|
||
|
||
def _normalize_cui(value) -> str:
    """Return a worksheet cell as a clean ID string, or '' when the cell holds no usable ID."""
    if pd.isna(value):
        return ''
    # pandas reads a numeric column that has blank cells as float; render 123.0 as '123', not '123.0'.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
    """Collect the CUIs of bad MedGen xrefs from the conflicts workbook and write them to a text file.

    Every worksheet contributes IDs of xrefs that should be removed from Mondo. The IDs are de-duplicated,
    converted to strings, sorted, and written to ``output_file`` one per line, without a header.

    Blank cells are skipped rather than written out as the string 'nan', since the resulting list is meant to be
    used as a set of removal patterns.

    :param input_file: Path to the Excel workbook containing the MedGen conflict worksheets.
    :param output_file: Path of the text file to write.
    """
    # Sheets where one column directly lists the xrefs to remove:
    # - Bad_CNxRefs_withCUI, WrongCUIxref_withCurrCUI
    #   - summary: MedGen team has suggested us better mappings to use on these Mondo IDs.
    #   - recommended action: For Mondo terms in "mondo_id", remove the xrefs identified in "mondo_xref_bad".
    # - >1MondoID_to1CUI
    #   - summary: MedGen has identified better Mondo IDs to map to these CUIs.
    #   - recommended action: For terms in "mondo_id(Mondo)", remove xrefs identified in the "UMLS_CUI" column.
    # - MondoXrefs_NotinMedGen
    #   - summary: These terms do not exist in MedGen.
    #   - recommended action: Remove these xrefs.
    direct_sheets = (
        ("Bad_CNxRefs_withCUI", "mondo_xref_bad"),
        ("WrongCUIxref_withCurrCUI", "mondo_xref_bad"),
        (">1MondoID_to1CUI", "UMLS_CUI"),
        ("MondoXrefs_NotinMedGen", "ref_target"),
    )
    bad_xref_cuis: Set[str] = set()

    # The context manager makes sure the workbook handle is closed even if a sheet or column is missing.
    with pd.ExcelFile(input_file) as book:
        for sheet_name, column in direct_sheets:
            df = book.parse(sheet_name)
            bad_xref_cuis.update(_normalize_cui(x) for x in df[column].tolist())

        # Sheet: Mondo_MedGen_CUImismatch_1Mondo
        # - summary: We have Mondo terms that have xrefs to multiple MedGen CUIs. Only 1 of them should be left in.
        # - recommended action: Remove incorrect xrefs; the ones not identified in the "MedGenCUI" column.
        # Every non-Mondo term of the query other than the correct CUI is collected (not only the first one),
        # and rows with no such term or with a blank query are skipped instead of raising.
        df = book.parse("Mondo_MedGen_CUImismatch_1Mondo")
        for query, correct_cui in zip(df['medgen_query'].tolist(), df['MedGenCUI'].tolist()):
            correct = _normalize_cui(correct_cui)
            for term in _normalize_cui(query).split(' OR '):
                term = term.strip()
                if 'MONDO' not in term and term != correct:
                    bad_xref_cuis.add(term)

    # Blank cells and blank query terms were normalized to ''; they are not IDs.
    bad_xref_cuis.discard('')
    sorted_cuis: List[str] = sorted(bad_xref_cuis)

    # Output: List of CUIs of bad xrefs
    out_df = pd.DataFrame(sorted_cuis, columns=['bad_xref_cuis'])
    out_df.to_csv(output_file, index=False, header=False)
|
||
def cli():
    """Command line interface.

    Parses ``-i/--input-file`` and ``-o/--output-file`` (both default to the module-level paths) and passes them
    to ``run``.
    """
    parser = ArgumentParser(
        prog='Collate bad medgen xrefs',
        description='From an Excel file containing worksheets about MedGen conflicts, create a text file that '
                    'lists all the bad CUIs for xrefs that need to be removed from Mondo.')
    parser.add_argument(
        '-i', '--input-file', default=INPUT_FILE, help='Excel file containing conflicts sheets')
    parser.add_argument(
        '-o', '--output-file', default=OUTPUT_FILE, help='Txt file to contain CUIs for bad xrefs')
    args = parser.parse_args()
    run(input_file=args.input_file, output_file=args.output_file)
|
||
|
||
# Run the command line interface only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""From a list of bad MedGen xrefs, generate a grep command to remove them from Mondo. | ||
See also: | ||
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273 | ||
""" | ||
from argparse import ArgumentParser | ||
|
||
|
||
def run(input_file: str, output_file: str, target: str):
    """Write a shell script containing a ``grep -v`` command that drops bad xref lines from ``target``.

    Each non-blank line of ``input_file`` becomes one ``-e`` pattern. The generated command reads ``target`` and
    redirects the surviving lines to ``target + '.tmp'``. Patterns and paths are shell-quoted, so values made only
    of shell-safe characters appear unchanged while anything else is quoted.

    :param input_file: Text file with one CUI per line; blank lines are ignored.
    :param output_file: Path used to name the script: a trailing '.txt' is replaced by '-grep-command.sh'. A path
        without that suffix is used as is.
    :param target: File the generated command should filter.
    """
    import shlex  # local import: only needed here

    with open(input_file, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        # A blank line would yield a bare "-e", which swallows the next argument as its pattern.
        bad_xref_cuis = [cui for cui in stripped_lines if cui]

    # Swap only the extension; str.replace would also rewrite a '.txt' occurring in a directory name.
    suffix = '.txt'
    if output_file.endswith(suffix):
        script_path = output_file[:-len(suffix)] + '-grep-command.sh'
    else:
        script_path = output_file

    quoted_target = shlex.quote(target)
    quoted_result = shlex.quote(target + '.tmp')
    if bad_xref_cuis:
        pattern_args = ''.join(f"-e {shlex.quote(cui)} " for cui in bad_xref_cuis)
        command_string = f"grep -v {pattern_args}{quoted_target} > {quoted_result}\n"
    else:
        # With no patterns, grep would take the target as its pattern and wait on stdin; nothing needs to be
        # filtered out, so copy the file through unchanged.
        command_string = f"cat {quoted_target} > {quoted_result}\n"

    with open(script_path, "w", encoding="utf-8") as file:
        file.write(command_string)
|
||
def cli(argv=None):
    """Command line interface.

    Parses ``-i/--input-file``, ``-o/--output-file`` and ``-t/--target`` and passes them to ``run``. All three are
    required, so a missing option is reported by argparse instead of reaching ``run`` as ``None``.

    :param argv: Argument list to parse; ``None`` (the default) makes argparse read ``sys.argv``.
    """
    parser = ArgumentParser(
        prog='Bad medgen xrefs grep command',
        description='From a list of bad MedGen xrefs, generate a grep command to remove them from Mondo.')
    parser.add_argument(
        '-i', '--input-file', required=True, help='Txt file containing CUIs for bad xrefs, one per line')
    parser.add_argument(
        '-o', '--output-file', required=True,
        help="Output path; its '.txt' extension is swapped for '-grep-command.sh' to name the generated script")
    parser.add_argument(
        '-t', '--target', required=True, help='Target file to use grep on')
    args = parser.parse_args(argv)
    run(input_file=args.input_file, output_file=args.output_file, target=args.target)
|
||
|
||
# Run the command line interface only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
"""MedGen conflict updates QC analysis | ||
QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts. | ||
See also: | ||
- GH issue: https://github.com/monarch-initiative/mondo-ingest/issues/273 | ||
""" | ||
from argparse import ArgumentParser | ||
from collections import Counter | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
# Directory containing this script, and the directory two levels above it (used as the project root).
SCRIPTS_DIR = Path(__file__).parent
PROJECT_DIR = SCRIPTS_DIR.parent.parent
# Working directory for temporary inputs and outputs.
TMP_DIR = PROJECT_DIR.joinpath("src", "ontology", "tmp")
# Default diff to analyze and default report location, stored as plain strings.
INPUT_FILE = str(TMP_DIR.joinpath("mondo-edit.obo.tmp.diff"))
OUTPUT_FILE = str(TMP_DIR.joinpath("report-qc-medgen-conflicts-update-diff.tsv"))
|
||
|
||
def _summarize_diff(diff_lines) -> dict:
    """Compute the QC metrics for the lines of a normal-format diff; returns metric name -> value."""
    report = {}

    # Keep only removed lines ('< ...') and drop the 2-character '< ' marker. Added lines ('> ...') are left
    # out so that they cannot be counted as removals.
    removed = [line[2:] for line in diff_lines if line.startswith('<')]
    report['n_lines_removed'] = len(removed)

    # Report: How many xrefs / non-xrefs removed
    xref_removals = [line for line in removed if line.startswith('xref')]
    non_xref_removals = [line for line in removed if not line.startswith('xref')]
    report['n_xrefs_removed'] = len(xref_removals)
    report['n_non_xrefs_removals'] = len(non_xref_removals)

    # Report: Non-xref removals details (the removed lines themselves, concatenated)
    report['non_xref_removals_content'] = ''.join(non_xref_removals)

    # Report: Non MedGen xrefs removed that got tagged because UMLS was a source, and MedGen xrefs removed
    umls_removals = [line for line in xref_removals if line.startswith('xref: UMLS')]
    report['n_xrefs_removed_cuz_umls_was_a_source'] = len(xref_removals) - len(umls_removals)
    report['n_umls_removals'] = len(umls_removals)

    # Report: MedGen xref deletions on multiple Mondo IDs.
    # split() without arguments also strips the trailing newline, so an xref with no trailing qualifier is
    # counted under the same ID as one that has a qualifier.
    removals_per_cui = Counter(line.split()[1] for line in umls_removals)
    n_removal_frequency = Counter(removals_per_cui.values())
    for n_times in sorted(n_removal_frequency):
        report[f'n_single_cui_removed_from_xrefs_n_times__{n_times}'] = n_removal_frequency[n_times]

    return report


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
    """Analyze a diff of mondo-edit.obo and write the QC metrics as a two-column TSV.

    :param input_file: Path to the diff file; lines starting with '<' are treated as removed lines.
    :param output_file: Path of the TSV report to write, with columns ``metric_name`` and ``val``.
    """
    with open(input_file, encoding='utf-8') as f:
        report = _summarize_diff(f.readlines())

    df = pd.DataFrame([{'metric_name': k, 'val': v} for k, v in report.items()])
    df.to_csv(output_file, sep='\t', index=False)
|
||
def cli():
    """Parse the command line options and run the QC analysis with them."""
    arg_parser = ArgumentParser(
        prog='MedGen conflict updates QC analysis',
        description='QC analysis for updates to mondo-edit.obo concerning removal of xrefs due to MedGen conflicts')
    # Both options fall back to the module-level default paths.
    arg_parser.add_argument(
        '-i', '--input-file', default=INPUT_FILE, help='Diff of mondo-edit.obo and mondo-edit.obo.tmp')
    arg_parser.add_argument(
        '-o', '--output-file', default=OUTPUT_FILE, help='Analysis results')
    options = arg_parser.parse_args()
    run(input_file=options.input_file, output_file=options.output_file)
|
||
|
||
# Run the command line interface only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    cli()