Skip to content

Commit

Permalink
Create RD subset pipeline (#8095)
Browse files Browse the repository at this point in the history
* Create RD subset pipeline

* Remove redundant subsets/mondo-rare.kgx.tsv

This has already been fixed on master branch

* change dir ref from tmp to $(TMPDIR) for Rare Subset and EMC goals

* change tmp dir ref to $(TMPDIR)

* Fixing NORD URL

* re-run update-rare-disease-subset goal after updating qc checks in mondo-ingest

* Refactor pipeline to allow simulating EMC runs

* Fix the GARD goal

This is now simply using the usual setup, with a minor addition which rewrites obsolete GARD mappings to the correct format.

* Return mondo-edit.obo back to master state

* Update all EMC incl. rare

* Add all rare subsets to Animal rare QC check

* Update Animal disease queries to have the subset as the value instead of property

* Update mondo-edit.obo

* Update mondo-edit.obo

---------

Co-authored-by: Trish Whetzel <[email protected]>
  • Loading branch information
matentzn and twhetzel authored Aug 24, 2024
1 parent ed35e5b commit 163b417
Show file tree
Hide file tree
Showing 7 changed files with 341 additions and 1,347 deletions.
1,374 changes: 108 additions & 1,266 deletions src/ontology/mondo-edit.obo

Large diffs are not rendered by default.

224 changes: 160 additions & 64 deletions src/ontology/mondo.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -266,79 +266,197 @@ $(TEMPLATES_DIR)/ROBOT_addMedGen_fromConflictResolution.tsv: tmp/July2023_CUIRep
$(TEMPLATES_DIR)/ROBOT_addMedGen_fromIngest.tsv:
wget "https://github.com/monarch-initiative/medgen/releases/latest/download/medgen-xrefs.robot.template.tsv" -O $@


######################################################
##### Mondo External Content Pipeline ################
##### Mondo Ingest Update Pipelines ##################
######################################################

# 1. Rare disease pipeline
# 2. Externally managed content pipeline

# CHANGE THIS TO THE MAIN BRANCH BEFORE MERGING!!!
MONDO_INGEST_LOCATION=https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/externalclingenmedgenefo/src/ontology/external
MONDO_INGEST_EXTERNAL_LOCATION=https://raw.githubusercontent.com/monarch-initiative/mondo-ingest/externalclingenmedgenefo/src/ontology/external

DOWNLOAD_EXTERNAL=true

# All the content for this pipeline is pulled from the mondo-ingest repo

# Fetch one pre-processed ROBOT file (processed-<name>.robot.owl) from
# $(MONDO_INGEST_EXTERNAL_LOCATION) into $(TMPDIR)/external, which is the path
# the update-* goals request.
# When DOWNLOAD_EXTERNAL is anything other than "true" nothing is downloaded and
# whatever file is already in place is used as is.
# The download is written to a temporary name and only renamed on success, so a
# failed wget cannot leave an empty file behind that looks up to date.
$(TMPDIR)/external/processed-%.robot.owl:
	mkdir -p $(@D)
	if [ "$(DOWNLOAD_EXTERNAL)" = "true" ]; then \
		wget "$(MONDO_INGEST_EXTERNAL_LOCATION)/processed-$*.robot.owl" -O $@.tmp && mv $@.tmp $@; \
	fi

# Rare disease pipeline: refresh each rare-disease subset in turn and print the
# subset metrics taken before and after the run so they can be compared.
# Order matters: the inferred subset is computed from the subsets updated
# before it.
# Declared phony because this is a command, not a file to be built.
.PHONY: update-rare-disease-subset
update-rare-disease-subset:
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-before.tsv
	$(MAKE) update-orphanet-rare -B
	$(MAKE) update-gard -B
	$(MAKE) update-nord -B
	$(MAKE) update-inferred-subset -B
	$(MAKE) update-rare-subset -B
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-after.tsv
	@echo "Subset metrics before..."
	cat $(TMPDIR)/subset-metrics-before.tsv
	@echo "Subset metrics after..."
	cat $(TMPDIR)/subset-metrics-after.tsv

# This is the main pipeline to update all external content.
# It runs each external-content update goal in turn and prints the subset
# metrics taken before and after the run so they can be compared.
# Declared phony because this is a command, not a file to be built.
.PHONY: update-external-content
update-external-content:
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-before.tsv
	$(MAKE) update-efo-subset -B
	$(MAKE) update-clingen -B
	$(MAKE) update-ordo-subsets -B
	$(MAKE) update-nando -B
	$(MAKE) update-medgen -B
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-after.tsv
	@echo "Subset metrics before..."
	cat $(TMPDIR)/subset-metrics-before.tsv
	@echo "Subset metrics after..."
	cat $(TMPDIR)/subset-metrics-after.tsv

# Combined pipeline: runs the external-content update goals first and then the
# rare disease subset update goals, printing the subset metrics taken before
# and after the whole run.
# The ORDO subsets are updated before the orphanet_rare subset, and the
# inferred subset after the subsets it is computed from.
# Declared phony because this is a command, not a file to be built.
.PHONY: update-external-content-incl-rare
update-external-content-incl-rare:
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-before.tsv
	$(MAKE) update-efo-subset -B
	$(MAKE) update-clingen -B
	$(MAKE) update-ordo-subsets -B
	$(MAKE) update-nando -B
	$(MAKE) update-medgen -B
	$(MAKE) update-orphanet-rare -B
	$(MAKE) update-gard -B
	$(MAKE) update-nord -B
	$(MAKE) update-inferred-subset -B
	$(MAKE) update-rare-subset -B
	$(MAKE) subset-metrics -B && cp $(TMPDIR)/subset-metrics.tsv $(TMPDIR)/subset-metrics-after.tsv
	@echo "Subset metrics before..."
	cat $(TMPDIR)/subset-metrics-before.tsv
	@echo "Subset metrics after..."
	cat $(TMPDIR)/subset-metrics-after.tsv

######################################################
##### Mondo Rare Disease Pipeline ####################
######################################################

##### Orphanet Rare ################

# Build the orphanet_rare subset file: load the edit file, run the reasoner,
# then run the construct query and write its result (Turtle) to the target.
$(TMPDIR)/orphanet-rare-subset.owl: $(SRC)
	$(ROBOT) merge --input $< \
		reason \
		query --format ttl --query ../sparql/construct/construct-orphanet-rare-subset.sparql $@

# Regenerate the orphanet_rare subset tags in the edit file:
#   1. build the subset file from the current edit file,
#   2. strip the existing "subset: orphanet_rare" lines from $(SRC),
#   3. merge the freshly built subset back in, then run the NORM goal and move
#      its output over $(SRC).
# grep exits with 1 when it prints no lines, which is tolerated; any other
# failure (e.g. unreadable file) aborts before $(SRC) is overwritten.
# Sub-makes are invoked through $(MAKE) on their own lines so that flags are
# propagated and a dry run (-n) does not execute the surrounding mv commands.
.PHONY: update-orphanet-rare
update-orphanet-rare:
	$(MAKE) $(TMPDIR)/orphanet-rare-subset.owl
	grep -vE '^(subset: orphanet_rare)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || [ $$? -eq 1 ]
	mv $(TMPDIR)/mondo-edit.tmp $(SRC)
	$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/orphanet-rare-subset.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
	mv $(SRC).obo $(SRC)
	$(MAKE) NORM
	mv NORM $(SRC)

##### GARD #########################

# The complex part here is that we need to dynamically update the MONDO source code, i.e.
# MONDO:equivalentTo and MONDO:obsoleteEquivalentTo.

# Regenerate the GARD xrefs and gard_rare subset tags in the edit file:
#   1. obtain processed-gard.robot.owl,
#   2. strip the existing "xref: GARD:" and "subset: gard_rare" lines from $(SRC),
#   3. merge the processed file in and apply insert-gard-obsoletion-status.ru,
#   4. run NORM, then deprecated_annotation_merging, then NORM again, moving
#      the NORM output over $(SRC) each time.
# grep exits with 1 when it prints no lines, which is tolerated; any other
# failure aborts before $(SRC) is overwritten.
# Sub-makes are invoked through $(MAKE) on their own lines so that flags are
# propagated and a dry run (-n) does not execute the surrounding mv commands.
.PHONY: update-gard
update-gard:
	$(MAKE) $(TMPDIR)/external/processed-gard.robot.owl
	grep -vE '^(xref: GARD:|subset: gard_rare)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || [ $$? -eq 1 ]
	mv $(TMPDIR)/mondo-edit.tmp $(SRC)
	$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-gard.robot.owl --collapse-import-closure false \
		query --update ../sparql/update/insert-gard-obsoletion-status.ru \
		convert -f obo --check false -o $(SRC).obo
	mv $(SRC).obo $(SRC)
	$(MAKE) NORM
	mv NORM $(SRC)
	$(MAKE) deprecated_annotation_merging
	$(MAKE) NORM
	mv NORM $(SRC)

##### NORD #########################

# Regenerate the NORD xrefs and nord_rare subset tags in the edit file:
#   1. force a rebuild (-B) of processed-nord.robot.owl,
#   2. strip the existing "xref: NORD:" and "subset: nord_rare" lines from $(SRC),
#   3. merge the processed file in, then run the NORM goal and move its output
#      over $(SRC).
# grep exits with 1 when it prints no lines, which is tolerated; any other
# failure aborts before $(SRC) is overwritten.
# Sub-makes are invoked through $(MAKE) on their own lines so that flags are
# propagated and a dry run (-n) does not execute the surrounding mv commands.
.PHONY: update-nord
update-nord:
	$(MAKE) $(TMPDIR)/external/processed-nord.robot.owl -B
	grep -vE '^(xref: NORD:|subset: nord_rare)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || [ $$? -eq 1 ]
	mv $(TMPDIR)/mondo-edit.tmp $(SRC)
	$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-nord.robot.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
	mv $(SRC).obo $(SRC)
	$(MAKE) NORM
	mv NORM $(SRC)

##### Inferred #####################

# The inferred subset depends on the other ones, so we need to first remove the old subsets
# Then add the gard, nord and orphanet subsets back in
# Build the inferred_rare subset file: load the edit file, run the reasoner,
# then run the construct query and write its result (Turtle) to the target.
$(TMPDIR)/inferred-rare-subset.owl: $(SRC)
	$(ROBOT) merge --input $< reason \
		query --format ttl --query ../sparql/construct/construct-inferred-rare-subset.sparql $@

# Regenerate the inferred_rare subset tags in the edit file:
#   1. build the subset file from the current edit file,
#   2. strip the existing "subset: inferred_rare" lines from $(SRC),
#   3. merge the freshly built subset back in, then run the NORM goal and move
#      its output over $(SRC).
# grep exits with 1 when it prints no lines, which is tolerated; any other
# failure aborts before $(SRC) is overwritten.
# Sub-makes are invoked through $(MAKE) on their own lines so that flags are
# propagated and a dry run (-n) does not execute the surrounding mv commands.
# Declared phony because this is a command, not a file to be built.
.PHONY: update-inferred-subset
update-inferred-subset:
	$(MAKE) $(TMPDIR)/inferred-rare-subset.owl
	grep -vE '^(subset: inferred_rare)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || [ $$? -eq 1 ]
	mv $(TMPDIR)/mondo-edit.tmp $(SRC)
	$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/inferred-rare-subset.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
	mv $(SRC).obo $(SRC)
	$(MAKE) NORM
	mv NORM $(SRC)

##### RARE #########################

# Build the rare subset file by running the construct query over the edit file
# (no reasoning step here) and writing the result (Turtle) to the target.
# The target lives under $(TMPDIR) because that is the path update-rare-subset
# asks make to build.
$(TMPDIR)/rare-subset.owl: $(SRC)
	$(ROBOT) merge -i $(SRC) \
		query --format ttl --query ../sparql/construct/construct-rare-subset.sparql $@

# Regenerate the rare subset tags in the edit file:
#   1. build the subset file from the current edit file,
#   2. strip the existing lines that are exactly "subset: rare" from $(SRC)
#      (the pattern is anchored at both ends),
#   3. merge the freshly built subset back in, then run the NORM goal and move
#      its output over $(SRC).
# grep exits with 1 when it prints no lines, which is tolerated; any other
# failure aborts before $(SRC) is overwritten.
# Sub-makes are invoked through $(MAKE) on their own lines so that flags are
# propagated and a dry run (-n) does not execute the surrounding mv commands.
.PHONY: update-rare-subset
update-rare-subset:
	$(MAKE) $(TMPDIR)/rare-subset.owl
	grep -vE '^(subset: rare)$$' $(SRC) > $(TMPDIR)/mondo-edit.tmp || [ $$? -eq 1 ]
	mv $(TMPDIR)/mondo-edit.tmp $(SRC)
	$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/rare-subset.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
	mv $(SRC).obo $(SRC)
	$(MAKE) NORM
	mv NORM $(SRC)


######################################################
##### Mondo External Content Pipeline ################
######################################################

####################################
##### Orphanet #####################
####################################

tmp/ordo-subsets.robot.owl:
wget "$(MONDO_INGEST_LOCATION)/processed-ordo-subsets.robot.owl" -O $@

.PHONY: update-ordo-subsets
update-ordo-subsets:
$(MAKE) tmp/ordo-subsets.robot.owl -B
grep -vE '^(subset: ordo_group_of_disorders)' $(SRC) | grep -vE '^(subset: ordo_disorder)' | grep -vE '^(subset: ordo_subtype_of_a_disorder)' > tmp/mondo-edit.tmp || true
mv tmp/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i tmp/ordo-subsets.robot.owl --collapse-import-closure false convert -f obo --check false -o tmp/mondo-edit.tmp
mv tmp/mondo-edit.tmp $(SRC) && make NORM && mv NORM $(SRC)
$(MAKE) $(TMPDIR)/external/processed-ordo-subsets.robot.owl -B
grep -vE '^(subset: ordo_group_of_disorders)' $(SRC) | grep -vE '^(subset: ordo_disorder)' | grep -vE '^(subset: ordo_subtype_of_a_disorder)' > $(TMPDIR)/mondo-edit.tmp || true
mv $(TMPDIR)/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-ordo-subsets.robot.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
mv $(SRC).obo $(SRC) && make NORM && mv NORM $(SRC)

####################################
##### NANDO #########################
####################################

tmp/nando.template.owl:
wget $(MONDO_INGEST_LOCATION)/processed-nando-mappings.robot.owl -O $@

.PHONY: update-nando
update-nando:
$(MAKE) tmp/nando.template.owl -B
grep -vE '^(xref: NANDO:)' $(SRC) > tmp/mondo-edit.tmp || true
mv tmp/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i tmp/nando.template.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
$(MAKE) $(TMPDIR)/external/processed-nando-mappings.robot.owl -B
grep -vE '^(xref: NANDO:)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || true
mv $(TMPDIR)/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-nando-mappings.robot.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
mv $(SRC).obo $(SRC) && make NORM && mv NORM $(SRC)

####################################
##### CLINGEN ######################
####################################

tmp/clingen.template.owl:
wget "$(MONDO_INGEST_LOCATION)/processed-mondo-clingen.robot.owl" -O $@

.PHONY: update-clingen
update-clingen:
$(MAKE) tmp/clingen.template.owl
grep -vE '^(relationship: curated_content_resource https://search.clinicalgenome.org|subset: clingen)' $(SRC) > tmp/mondo-edit.tmp
$(MAKE) $(TMPDIR)/external/processed-mondo-clingen.robot.owl
grep -vE '^(relationship: curated_content_resource https://search.clinicalgenome.org|subset: clingen)' $(SRC) > $(TMPDIR)/mondo-edit.tmp
#CAREFUL, this needs to be uncommented when we want to include CLINGEN LABELs
#sed -i 's/EXACT CLINGEN_LABEL/EXACT/g' tmp/mondo-edit.tmp || true
mv tmp/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i tmp/clingen.template.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
#sed -i 's/EXACT CLINGEN_LABEL/EXACT/g' $(TMPDIR)/mondo-edit.tmp || true
mv $(TMPDIR)/mondo-edit.tmp $(SRC)
$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-mondo-clingen.robot.owl --collapse-import-closure false convert -f obo --check false -o $(SRC).obo
mv $(SRC).obo $(SRC) && make NORM && mv NORM $(SRC)

####################################
##### EFO ##########################
####################################

tmp/mondo-efo.template.owl:
wget "$(MONDO_INGEST_LOCATION)/processed-mondo-efo.robot.owl" -O $@

tmp/mondo-otar-subset.template.owl:
wget "$(MONDO_INGEST_LOCATION)/processed-mondo-otar-subset.robot.owl" -O $@

tmp/efo-proxy-merges.template.owl:
wget "$(MONDO_INGEST_LOCATION)/efo-proxy-merges.robot.owl" -O $@

.PHONY: update-efo-subset
update-efo-subset:
$(MAKE) tmp/mondo-otar-subset.template.owl tmp/mondo-efo.template.owl tmp/efo-proxy-merges.template.owl
$(MAKE) $(TMPDIR)/external/processed-mondo-otar-subset.robot.owl $(TMPDIR)/external/processed-mondo-efo.robot.owl $(TMPDIR)/external/processed-efo-proxy-merges.robot.owl
grep -vE '^(xref: EFO:|subset: otar)' $(SRC) > tmp/mondo-edit.tmp || true
mv tmp/mondo-edit.tmp mondo-edit.obo
$(ROBOT) merge -i $(SRC) -i tmp/mondo-otar-subset.template.owl -i tmp/mondo-efo.template.owl -i tmp/efo-proxy-merges.template.owl --collapse-import-closure false \
mv $(TMPDIR)/mondo-edit.tmp mondo-edit.obo
$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-mondo-otar-subset.robot.owl -i $(TMPDIR)/external/processed-mondo-efo.robot.owl -i $(TMPDIR)/external/processed-efo-proxy-merges.robot.owl --collapse-import-closure false \
query --use-graphs false --update ../sparql/update/update-equivalent-obsolete.ru \
convert -f obo --check false -o $(SRC).obo
mv $(SRC).obo $(SRC) && make NORM && mv NORM $(SRC)
Expand All @@ -347,42 +465,20 @@ update-efo-subset:
##### MedGen #######################
####################################

# CHANGE THIS TO THE MAIN BRANCH BEFORE MERGING!!!

tmp/mondo-medgen.template.owl:
wget "$(MONDO_INGEST_LOCATION)/processed-mondo-medgen.robot.owl" -O $@

.PHONY: update-medgen
update-medgen:
$(MAKE) tmp/mondo-medgen.template.owl
grep -vE '^(xref: UMLS:|xref: MEDGEN:|subset: medgen)' $(SRC) > tmp/mondo-edit.tmp || true
mv tmp/mondo-edit.tmp mondo-edit.obo
$(ROBOT) merge -i $(SRC) -i tmp/mondo-medgen.template.owl --collapse-import-closure false \
$(MAKE) $(TMPDIR)/external/processed-mondo-medgen.robot.owl
grep -vE '^(xref: UMLS:|xref: MEDGEN:|subset: medgen)' $(SRC) > $(TMPDIR)/mondo-edit.tmp || true
mv $(TMPDIR)/mondo-edit.tmp mondo-edit.obo
$(ROBOT) merge -i $(SRC) -i $(TMPDIR)/external/processed-mondo-medgen.robot.owl --collapse-import-closure false \
query --use-graphs false --update ../sparql/update/update-equivalent-obsolete.ru \
convert -f obo --check false -o $(SRC).obo
mv $(SRC).obo $(SRC) && make NORM && mv NORM $(SRC)

##########################################
###### Update all external content #######
##########################################

# This is the main pipeline to update all external content
update-external-content:
$(MAKE) subset-metrics -B && cp tmp/subset-metrics.tsv tmp/subset-metrics-before.tsv
$(MAKE) update-efo-subset -B
$(MAKE) update-clingen -B
$(MAKE) update-ordo-subsets -B
$(MAKE) update-nando -B
$(MAKE) update-medgen -B
$(MAKE) subset-metrics -B && cp tmp/subset-metrics.tsv tmp/subset-metrics-after.tsv
@echo "Subset metrics before..."
cat tmp/subset-metrics-before.tsv
@echo "Subset metrics after..."
cat tmp/subset-metrics-after.tsv

.PHONY: subset-metrics
subset-metrics:
$(ROBOT) query -f tsv -i $(SRC) --query $(SPARQLDIR)/reports/count-subsets.sparql tmp/$@.tsv
$(ROBOT) query -f tsv -i $(SRC) --query $(SPARQLDIR)/reports/count-subsets.sparql $(TMPDIR)/$@.tsv


#############################################
Expand Down
28 changes: 19 additions & 9 deletions src/sparql/construct/construct-orphanet-rare-subset.sparql
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,44 @@ prefix oio: <http://www.geneontology.org/formats/oboInOwl#>
prefix def: <http://purl.obolibrary.org/obo/IAO_0000115>
prefix owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

#classes with the axiom "'bearer of' some rare" AND/OR "has subset :‘gard_rare’" AND limited to disease branch (ie not including the disease susceptibility branch
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

CONSTRUCT {
?cls <http://www.geneontology.org/formats/oboInOwl#inSubset> <http://purl.obolibrary.org/obo/mondo#orphanet_rare> .
[] rdf:type owl:Axiom ;
owl:annotatedSource ?cls ;
owl:annotatedProperty <http://www.geneontology.org/formats/oboInOwl#inSubset> ;
owl:annotatedTarget <http://purl.obolibrary.org/obo/mondo#orphanet_rare> ;
oboInOwl:source ?xref .
}

WHERE
{
?cls a owl:Class .

{
# Match the XREF and the fact this term belongs to the ordo_disorder subset
?cls <http://www.geneontology.org/formats/oboInOwl#hasDbXref> ?xref .

# Only ordo_disorder is included as per request by Orphanet: these are the diseases they truly consider rare
?cls <http://www.geneontology.org/formats/oboInOwl#inSubset> <http://purl.obolibrary.org/obo/mondo#ordo_disorder> .

# For safety reasons, we only consider the case that the evidence of the subset corresponds exactly to the xref
# This is probably redundant, but it is possible that a class has two ORDO xrefs, one exact, one not
# and we only want to record the exact xref as evidence for the rare subset
?a <http://www.w3.org/2002/07/owl#annotatedSource> ?cls .
?a <http://www.w3.org/2002/07/owl#annotatedProperty> <http://www.geneontology.org/formats/oboInOwl#hasDbXref> .
?a <http://www.w3.org/2002/07/owl#annotatedTarget> ?xref .
?a <http://www.geneontology.org/formats/oboInOwl#source> ?source .
?a <http://www.w3.org/2002/07/owl#annotatedProperty> <http://www.geneontology.org/formats/oboInOwl#inSubset> .
?a <http://www.w3.org/2002/07/owl#annotatedTarget> <http://purl.obolibrary.org/obo/mondo#ordo_disorder> .
?a <http://www.geneontology.org/formats/oboInOwl#source> ?xref .

}

FILTER (?cls NOT IN (MONDO:0000001))
FILTER (?source="MONDO:equivalentTo")
FILTER (strStarts(?xref,"Orphanet:"))

FILTER NOT EXISTS {
?cls owl:deprecated "true"^^xsd:boolean
}
FILTER( !isBlank(?cls) && STRSTARTS(str(?cls), "http://purl.obolibrary.org/obo/MONDO_"))
FILTER( STR(?xref) != "Orphanet:377788")
# See https://github.com/monarch-initiative/mondo/issues/7704, some specific mappings should be excluded
}

2 changes: 1 addition & 1 deletion src/sparql/construct/construct-rare-subset.sparql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ prefix def: <http://purl.obolibrary.org/obo/IAO_0000115>
prefix owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

INSERT {
CONSTRUCT {
?cls <http://www.geneontology.org/formats/oboInOwl#inSubset> <http://purl.obolibrary.org/obo/mondo#rare> .
}

Expand Down
26 changes: 19 additions & 7 deletions src/sparql/qc/mondo/qc-animal-disease-rare.sparql
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,27 @@ prefix mondo: <http://purl.obolibrary.org/obo/mondo#>

SELECT DISTINCT ?entity ?property ?value WHERE
{
VALUES ?property { <http://purl.obolibrary.org/obo/mondo#rare> }
?entity <http://www.geneontology.org/formats/oboInOwl#inSubset> ?property .
VALUES ?value {
<http://purl.obolibrary.org/obo/mondo#rare>
<http://purl.obolibrary.org/obo/mondo#nord_rare>
<http://purl.obolibrary.org/obo/mondo#orphanet_rare>
<http://purl.obolibrary.org/obo/mondo#gard_rare>
<http://purl.obolibrary.org/obo/mondo#inferred_rare>
<http://purl.obolibrary.org/obo/mondo#mondo_rare>
}
VALUES ?property {
<http://www.geneontology.org/formats/oboInOwl#inSubset>
}
?entity ?property ?value .
?entity rdfs:subClassOf* <http://purl.obolibrary.org/obo/MONDO_0005583>
FILTER NOT EXISTS {

FILTER NOT EXISTS {
?entity owl:deprecated "true"^^xsd:boolean
}
FILTER NOT EXISTS {
?entity mondo:excluded_from_qc_check mondoSparqlQcMondo:qc-animal-disease-rare.sparql .
}

FILTER NOT EXISTS {
?entity mondo:excluded_from_qc_check mondoSparqlQcMondo:qc-animal-disease-rare.sparql .
}

FILTER( !isBlank(?entity) && STRSTARTS(str(?entity), "http://purl.obolibrary.org/obo/MONDO_"))
BIND("Animal disease in Rare subset" as ?value)
}
13 changes: 13 additions & 0 deletions src/sparql/reports/excluded-synonyms.sparql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

# Lists every annotated axiom that carries the synonym type mondo#EXCLUDE and
# whose annotated source IRI starts with the MONDO_ prefix.
# Columns: annotated source, annotated property, annotated target.
SELECT ?mondo_id ?property ?synonym WHERE {
  ?axiom rdf:type owl:Axiom ;
         owl:annotatedSource ?mondo_id ;
         owl:annotatedProperty ?property ;
         owl:annotatedTarget ?synonym ;
         oboInOwl:hasSynonymType <http://purl.obolibrary.org/obo/mondo#EXCLUDE> .

  FILTER(STRSTARTS(STR(?mondo_id), "http://purl.obolibrary.org/obo/MONDO_"))
}
Loading

0 comments on commit 163b417

Please sign in to comment.