Skip to content

Commit

Permalink
Add streamlined method for getting curie names #2336
Browse files Browse the repository at this point in the history
  • Loading branch information
amykglen committed Aug 22, 2024
1 parent 4e9e95c commit 9055f97
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 37 deletions.
29 changes: 2 additions & 27 deletions code/ARAX/ARAXQuery/Expand/expand_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,33 +409,8 @@ def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) ->
def get_curie_names(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, str]:
    """
    Looks up the names of the input curie(s) via NodeSynonymizer.get_curie_names().

    :param curie: A single curie or a list of curies.
    :param log: Response object that debug/warning messages are written to.
    :return: Dict mapping input curies to the names the NodeSynonymizer returned for them. Curies absent
             from the NodeSynonymizer's result are absent from this dict as well (and are reported in a
             warning).
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer.get_curie_names()")
    # A single streamlined lookup; the older get_normalizer_results()-based lookup is no longer needed
    # since its result was being discarded in favor of this one
    curie_to_name_map = synonymizer.get_curie_names(curies)

    # Still let the user know about any input curies that no name came back for
    unrecognized_curies = set(curies).difference(curie_to_name_map)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
    return curie_to_name_map


Expand Down
101 changes: 93 additions & 8 deletions code/ARAX/NodeSynonymizer/node_synonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sqlite3
import string
import sys
import time
from collections import defaultdict
from typing import Optional, Union, List, Set, Dict, Tuple

Expand Down Expand Up @@ -56,7 +57,9 @@ def __del__(self):

def get_canonical_curies(self, curies: Optional[Union[str, Set[str], List[str]]] = None,
names: Optional[Union[str, Set[str], List[str]]] = None,
return_all_categories: bool = False) -> dict:
return_all_categories: bool = False,
debug: bool = False) -> dict:
start = time.time()

# Convert any input values to Set format
curies_set = self._convert_to_set_format(curies)
Expand Down Expand Up @@ -144,11 +147,15 @@ def get_canonical_curies(self, curies: Optional[Union[str, Set[str], List[str]]]
for unrecognized_value in unrecognized_input_values:
results_dict[unrecognized_value] = None

if debug:
print(f"Took {round(time.time() - start, 5)} seconds")
return results_dict

def get_equivalent_nodes(self, curies: Optional[Union[str, Set[str], List[str]]] = None,
names: Optional[Union[str, Set[str], List[str]]] = None,
include_unrecognized_entities: bool = True) -> dict:
include_unrecognized_entities: bool = True,
debug: bool = False) -> dict:
start = time.time()

# Convert any input values to Set format
curies_set = self._convert_to_set_format(curies)
Expand Down Expand Up @@ -210,9 +217,77 @@ def get_equivalent_nodes(self, curies: Optional[Union[str, Set[str], List[str]]]
for unrecognized_curie in unrecognized_curies:
results_dict[unrecognized_curie] = None

if debug:
print(f"Took {round(time.time() - start, 5)} seconds")
return results_dict

def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[str]]]) -> dict:
def get_preferred_names(self, curies: Union[str, Set[str], List[str]], debug: bool = False) -> dict:
    """
    Returns preferred names for input curies - i.e., the name of the curie's canonical identifier (the
    'name' recorded for the cluster that the curie's node belongs to).

    :param curies: A single curie or a set/list of curies.
    :param debug: If True, prints how long the lookup took.
    :return: Dict mapping each input curie that matched a row to its cluster's name; input curies with
             no matching row are left out.
    """
    start = time.time()

    input_curies = self._convert_to_set_format(curies)
    names_by_input_curie = dict()

    if input_curies:
        # Lookups are done using the form of each curie whose prefix is entirely uppercase
        input_to_capitalized, capitalized_curies = self._map_to_capitalized_curies(input_curies)

        # Grab the cluster name for each of these identifiers (query is run in batches, if necessary)
        sql_query_template = f"""
SELECT N.id_simplified, C.name
FROM nodes as N
INNER JOIN clusters as C on C.cluster_id == N.cluster_id
WHERE N.id_simplified in ('{self.placeholder_lookup_values_str}')"""
        rows = self._run_sql_query_in_batches(sql_query_template, capitalized_curies)

        # Index the names by capitalized curie, then map them back onto the curies as they were input
        names_by_capitalized_curie = dict()
        for row in rows:
            names_by_capitalized_curie[row[0]] = row[1]
        for input_curie, capitalized_curie in input_to_capitalized.items():
            if capitalized_curie in names_by_capitalized_curie:
                names_by_input_curie[input_curie] = names_by_capitalized_curie[capitalized_curie]

    if debug:
        print(f"Took {round(time.time() - start, 5)} seconds")
    return names_by_input_curie

def get_curie_names(self, curies: Union[str, Set[str], List[str]], debug: bool = False) -> dict:
    """
    Returns NON-preferred names for input curies; i.e., the name recorded directly on the curie's own
    node, not the name of its canonical identifier.

    :param curies: A single curie or a set/list of curies.
    :param debug: If True, prints how long the lookup took.
    :return: Dict mapping each input curie that matched a row to that node's name; input curies with no
             matching row are left out.
    """
    start = time.time()

    input_curies = self._convert_to_set_format(curies)
    names_by_input_curie = dict()

    if input_curies:
        # Lookups are done using the form of each curie whose prefix is entirely uppercase
        input_to_capitalized, capitalized_curies = self._map_to_capitalized_curies(input_curies)

        # Grab each identifier's own node name (query is run in batches, if necessary)
        sql_query_template = f"""
SELECT N.id_simplified, N.name
FROM nodes as N
WHERE N.id_simplified in ('{self.placeholder_lookup_values_str}')"""
        rows = self._run_sql_query_in_batches(sql_query_template, capitalized_curies)

        # Index the names by capitalized curie, then map them back onto the curies as they were input
        names_by_capitalized_curie = dict()
        for row in rows:
            names_by_capitalized_curie[row[0]] = row[1]
        for input_curie, capitalized_curie in input_to_capitalized.items():
            if capitalized_curie in names_by_capitalized_curie:
                names_by_input_curie[input_curie] = names_by_capitalized_curie[capitalized_curie]

    if debug:
        print(f"Took {round(time.time() - start, 5)} seconds")
    return names_by_input_curie

def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[str]]],
debug: bool = False) -> dict:
start = time.time()

# First handle any special input from /entity endpoint
output_format = None
Expand Down Expand Up @@ -298,6 +373,8 @@ def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[st
if normalizer_info:
normalizer_info["knowledge_graph"] = self._get_cluster_graph(normalizer_info)

if debug:
print(f"Took {round(time.time() - start, 5)} seconds")
return results_dict

# ---------------------------------------- EXTERNAL DEBUG METHODS --------------------------------------------- #
Expand Down Expand Up @@ -574,27 +651,35 @@ def main():
arg_parser.add_argument("-c", "--canonical", dest="canonical", action="store_true")
arg_parser.add_argument("-e", "--equivalent", dest="equivalent", action="store_true")
arg_parser.add_argument("-n", "--normalizer", dest="normalizer", action="store_true")
arg_parser.add_argument("-l", "--names", dest="names", action="store_true")
arg_parser.add_argument("-p", "--preferrednames", dest="preferred_names", action="store_true")
# Add a couple other data viewing options (tabular and TRAPI cluster graph format)
arg_parser.add_argument("-t", "--table", dest="table", action="store_true")
arg_parser.add_argument("-g", "--graph", dest="graph", action="store_true")
args = arg_parser.parse_args()

synonymizer = NodeSynonymizer()
if args.canonical:
results = synonymizer.get_canonical_curies(curies=args.curie_or_name)
results = synonymizer.get_canonical_curies(curies=args.curie_or_name, debug=True)
if not results[args.curie_or_name]:
results = synonymizer.get_canonical_curies(names=args.curie_or_name)
print(json.dumps(results, indent=2))
if args.equivalent:
results = synonymizer.get_equivalent_nodes(curies=args.curie_or_name)
results = synonymizer.get_equivalent_nodes(curies=args.curie_or_name, debug=True)
if not results[args.curie_or_name]:
results = synonymizer.get_equivalent_nodes(names=args.curie_or_name)
results = synonymizer.get_equivalent_nodes(names=args.curie_or_name, debug=True)
print(json.dumps(results, indent=2))
if args.normalizer:
results = synonymizer.get_normalizer_results(entities=args.curie_or_name)
results = synonymizer.get_normalizer_results(entities=args.curie_or_name, debug=True)
print(json.dumps(results, indent=2))
if args.names:
results = synonymizer.get_curie_names(curies=args.curie_or_name, debug=True)
print(json.dumps(results, indent=2))
if args.preferred_names:
results = synonymizer.get_preferred_names(curies=args.curie_or_name, debug=True)
print(json.dumps(results, indent=2))
# Default to printing the tabular view of the cluster if nothing else was specified
if args.table or (not args.canonical and not args.equivalent and not args.normalizer and not args.graph):
if args.table or not (args.canonical or args.equivalent or args.normalizer or args.names or args.preferred_names or args.graph):
synonymizer.print_cluster_table(args.curie_or_name)


Expand Down
26 changes: 24 additions & 2 deletions code/ARAX/test/test_ARAX_synonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
PARKINSONS_CURIE_2 = "MONDO:0005180"
IBUPROFEN_CURIE = "DRUGBANK:DB01050"
ACETAMINOPHEN_CURIE = "CHEMBL.COMPOUND:CHEMBL112"
ACETAMINOPHEN_CURIE_2 = "DRUGBANK:DB00316"
ACETAMINOPHEN_CURIE_2 = "CHEBI:46195"
SNCA_CURIE = "NCBIGene:6622"
FAKE_CURIE = "NOTAREALCURIE!"

Expand Down Expand Up @@ -133,7 +133,7 @@ def test_get_canonical_curies_simple():
synonymizer = NodeSynonymizer()
results = synonymizer.get_canonical_curies(curies)
print(results)
assert(len(results) == 3)
assert len(results) == 3
for curie in curies:
assert results.get(curie)
assert {"preferred_name", "preferred_category", "preferred_curie"} == set(results[curie])
Expand Down Expand Up @@ -269,6 +269,28 @@ def test_get_equivalent_nodes_by_curies_and_names():
assert ACETAMINOPHEN_CURIE_2 in results[ACETAMINOPHEN_CURIE]


def test_get_curie_names():
    """Both acetaminophen curies should come back with a non-empty name, and the two names should differ."""
    input_curies = [ACETAMINOPHEN_CURIE, ACETAMINOPHEN_CURIE_2]
    names_by_curie = NodeSynonymizer().get_curie_names(input_curies)
    print(names_by_curie)
    assert len(names_by_curie) == 2
    assert all(names_by_curie.get(input_curie) for input_curie in input_curies)
    # These are the curies' own (non-preferred) names, so they should be distinct
    distinct_names = set(names_by_curie.values())
    assert len(distinct_names) == 2


def test_get_preferred_names():
    """Each of three curies for different concepts should come back with its own non-empty preferred name."""
    input_curies = [ATRIAL_FIBRILLATION_CURIE, IBUPROFEN_CURIE, SNCA_CURIE]
    preferred_names_by_curie = NodeSynonymizer().get_preferred_names(input_curies)
    print(preferred_names_by_curie)
    assert len(preferred_names_by_curie) == 3
    assert all(preferred_names_by_curie.get(input_curie) for input_curie in input_curies)
    # Preferred names for different concepts should be distinct
    distinct_names = set(preferred_names_by_curie.values())
    assert len(distinct_names) == 3


def test_get_normalizer_results():
synonymizer = NodeSynonymizer()
input_entities = [PARKINSONS_CURIE, CERVICAL_RIB_NAME, IBUPROFEN_CURIE, FAKE_NAME]
Expand Down

0 comments on commit 9055f97

Please sign in to comment.