From 9055f975b858e6bb471d7aee0ff9e2f9f851a799 Mon Sep 17 00:00:00 2001 From: amykglen Date: Thu, 22 Aug 2024 14:12:26 -0700 Subject: [PATCH] Add streamlined method for getting curie names #2336 --- .../ARAX/ARAXQuery/Expand/expand_utilities.py | 29 +---- code/ARAX/NodeSynonymizer/node_synonymizer.py | 101 ++++++++++++++++-- code/ARAX/test/test_ARAX_synonymizer.py | 26 ++++- 3 files changed, 119 insertions(+), 37 deletions(-) diff --git a/code/ARAX/ARAXQuery/Expand/expand_utilities.py b/code/ARAX/ARAXQuery/Expand/expand_utilities.py index 544ef20c3..85e46b923 100644 --- a/code/ARAX/ARAXQuery/Expand/expand_utilities.py +++ b/code/ARAX/ARAXQuery/Expand/expand_utilities.py @@ -409,33 +409,8 @@ def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) -> def get_curie_names(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, str]: curies = convert_to_list(curie) synonymizer = NodeSynonymizer() - log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer") - synonymizer_info = synonymizer.get_normalizer_results(curies) - curie_to_name_map = dict() - if synonymizer_info: - recognized_input_curies = {input_curie for input_curie in synonymizer_info if synonymizer_info.get(input_curie)} - unrecognized_curies = set(curies).difference(recognized_input_curies) - if unrecognized_curies: - log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}") - input_curies_without_matching_node = set() - for input_curie in recognized_input_curies: - equivalent_nodes = synonymizer_info[input_curie]["nodes"] - # Find the 'node' in the synonymizer corresponding to this curie - input_curie_nodes = [node for node in equivalent_nodes if node["identifier"] == input_curie] - if not input_curie_nodes: - # Try looking for slight variation (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700" - input_curie_stripped = input_curie.replace(".COMPOUND", "") - input_curie_nodes = [node for node in equivalent_nodes if node["identifier"] == input_curie_stripped] - # Record the name for this input curie - if input_curie_nodes: - curie_to_name_map[input_curie] = input_curie_nodes[0].get("label") - else: - input_curies_without_matching_node.add(input_curie) - if input_curies_without_matching_node: - log.warning(f"No matching nodes found in NodeSynonymizer for these input curies: " - f"{input_curies_without_matching_node}. Cannot determine their specific names.") - else: - log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue") + log.debug(f"Looking up names for {len(curies)} input curies using NodeSynonymizer.get_curie_names()") + curie_to_name_map = synonymizer.get_curie_names(curies) return curie_to_name_map diff --git a/code/ARAX/NodeSynonymizer/node_synonymizer.py b/code/ARAX/NodeSynonymizer/node_synonymizer.py index 5e62726d8..8abf890be 100644 --- a/code/ARAX/NodeSynonymizer/node_synonymizer.py +++ b/code/ARAX/NodeSynonymizer/node_synonymizer.py @@ -6,6 +6,7 @@ import sqlite3 import string import sys +import time from collections import defaultdict from typing import Optional, Union, List, Set, Dict, Tuple @@ -56,7 +57,9 @@ def __del__(self): def get_canonical_curies(self, curies: Optional[Union[str, Set[str], List[str]]] = None, names: Optional[Union[str, Set[str], List[str]]] = None, - return_all_categories: bool = False) -> dict: + return_all_categories: bool = False, + debug: bool = False) -> dict: + start = time.time() # Convert any input values to Set format curies_set = self._convert_to_set_format(curies) @@ -144,11 +147,15 @@ def get_canonical_curies(self, curies: Optional[Union[str, Set[str], List[str]]] for unrecognized_value in unrecognized_input_values: results_dict[unrecognized_value] = None + if debug: + print(f"Took {round(time.time() - start, 5)} seconds") return results_dict def get_equivalent_nodes(self, curies: Optional[Union[str, Set[str], List[str]]] = None, names: Optional[Union[str, Set[str], List[str]]] = None, - include_unrecognized_entities: bool = True) -> dict: + include_unrecognized_entities: bool = True, + debug: bool = False) -> dict: + start = time.time() # Convert any input values to Set format curies_set = self._convert_to_set_format(curies) @@ -210,9 +217,77 @@ def get_equivalent_nodes(self, curies: Optional[Union[str, Set[str], List[str]]] for unrecognized_curie in unrecognized_curies: results_dict[unrecognized_curie] = None + if debug: + print(f"Took {round(time.time() - start, 5)} seconds") return results_dict - def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[str]]]) -> dict: + def get_preferred_names(self, curies: Union[str, Set[str], List[str]], debug: bool = False) -> dict: + """ + Returns preferred names for input curies - i.e., the name of the curie's canonical identifier. + """ + start = time.time() + + # Convert any input values to Set format + curies_set = self._convert_to_set_format(curies) + results_dict = dict() + + if curies_set: + # First transform curies so that their prefixes are entirely uppercase + curies_to_capitalized_curies, capitalized_curies = self._map_to_capitalized_curies(curies_set) + + # Query the synonymizer sqlite database for these identifiers (in batches, if necessary) + sql_query_template = f""" + SELECT N.id_simplified, C.name + FROM nodes as N + INNER JOIN clusters as C on C.cluster_id == N.cluster_id + WHERE N.id_simplified in ('{self.placeholder_lookup_values_str}')""" + matching_rows = self._run_sql_query_in_batches(sql_query_template, capitalized_curies) + + # Transform the results into the proper response format + results_dict_capitalized = {row[0]: row[1] for row in matching_rows} + results_dict = {input_curie: results_dict_capitalized[capitalized_curie] + for input_curie, capitalized_curie in curies_to_capitalized_curies.items() + if capitalized_curie in results_dict_capitalized} + + if debug: + print(f"Took {round(time.time() - start, 5)} seconds") + return results_dict + + def get_curie_names(self, curies: Union[str, Set[str], List[str]], debug: bool = False) -> dict: + """ + Returns NON-preferred names for input curies; i.e., the curie's direct name, not the name of its canonical + identifier. + """ + start = time.time() + + # Convert any input values to Set format + curies_set = self._convert_to_set_format(curies) + results_dict = dict() + + if curies_set: + # First transform curies so that their prefixes are entirely uppercase + curies_to_capitalized_curies, capitalized_curies = self._map_to_capitalized_curies(curies_set) + + # Query the synonymizer sqlite database for these identifiers (in batches, if necessary) + sql_query_template = f""" + SELECT N.id_simplified, N.name + FROM nodes as N + WHERE N.id_simplified in ('{self.placeholder_lookup_values_str}')""" + matching_rows = self._run_sql_query_in_batches(sql_query_template, capitalized_curies) + + # Transform the results into the proper response format + results_dict_capitalized = {row[0]: row[1] for row in matching_rows} + results_dict = {input_curie: results_dict_capitalized[capitalized_curie] + for input_curie, capitalized_curie in curies_to_capitalized_curies.items() + if capitalized_curie in results_dict_capitalized} + + if debug: + print(f"Took {round(time.time() - start, 5)} seconds") + return results_dict + + def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[str]]], + debug: bool = False) -> dict: + start = time.time() # First handle any special input from /entity endpoint output_format = None @@ -298,6 +373,8 @@ def get_normalizer_results(self, entities: Optional[Union[str, Set[str], List[st if normalizer_info: normalizer_info["knowledge_graph"] = self._get_cluster_graph(normalizer_info) + if debug: + print(f"Took {round(time.time() - start, 5)} seconds") return results_dict # ---------------------------------------- EXTERNAL DEBUG METHODS --------------------------------------------- # @@ -574,6 +651,8 @@ def main(): arg_parser.add_argument("-c", "--canonical", dest="canonical", action="store_true") arg_parser.add_argument("-e", "--equivalent", dest="equivalent", action="store_true") arg_parser.add_argument("-n", "--normalizer", dest="normalizer", action="store_true") + arg_parser.add_argument("-l", "--names", dest="names", action="store_true") + arg_parser.add_argument("-p", "--preferrednames", dest="preferred_names", action="store_true") # Add a couple other data viewing options (tabular and TRAPI cluster graph format) arg_parser.add_argument("-t", "--table", dest="table", action="store_true") arg_parser.add_argument("-g", "--graph", dest="graph", action="store_true") @@ -581,20 +660,26 @@ def main(): synonymizer = NodeSynonymizer() if args.canonical: - results = synonymizer.get_canonical_curies(curies=args.curie_or_name) + results = synonymizer.get_canonical_curies(curies=args.curie_or_name, debug=True) if not results[args.curie_or_name]: results = synonymizer.get_canonical_curies(names=args.curie_or_name) print(json.dumps(results, indent=2)) if args.equivalent: - results = synonymizer.get_equivalent_nodes(curies=args.curie_or_name) + results = synonymizer.get_equivalent_nodes(curies=args.curie_or_name, debug=True) if not results[args.curie_or_name]: - results = synonymizer.get_equivalent_nodes(names=args.curie_or_name) + results = synonymizer.get_equivalent_nodes(names=args.curie_or_name, debug=True) print(json.dumps(results, indent=2)) if args.normalizer: - results = synonymizer.get_normalizer_results(entities=args.curie_or_name) + results = synonymizer.get_normalizer_results(entities=args.curie_or_name, debug=True) + print(json.dumps(results, indent=2)) + if args.names: + results = synonymizer.get_curie_names(curies=args.curie_or_name, debug=True) + print(json.dumps(results, indent=2)) + if args.preferred_names: + results = synonymizer.get_preferred_names(curies=args.curie_or_name, debug=True) print(json.dumps(results, indent=2)) # Default to printing the tabular view of the cluster if nothing else was specified - if args.table or (not args.canonical and not args.equivalent and not args.normalizer and not args.graph): + if args.table or not (args.canonical or args.equivalent or args.normalizer or args.names or args.preferred_names or args.graph): synonymizer.print_cluster_table(args.curie_or_name) diff --git a/code/ARAX/test/test_ARAX_synonymizer.py b/code/ARAX/test/test_ARAX_synonymizer.py index 3d485772e..945a87a48 100644 --- a/code/ARAX/test/test_ARAX_synonymizer.py +++ b/code/ARAX/test/test_ARAX_synonymizer.py @@ -20,7 +20,7 @@ PARKINSONS_CURIE_2 = "MONDO:0005180" IBUPROFEN_CURIE = "DRUGBANK:DB01050" ACETAMINOPHEN_CURIE = "CHEMBL.COMPOUND:CHEMBL112" -ACETAMINOPHEN_CURIE_2 = "DRUGBANK:DB00316" +ACETAMINOPHEN_CURIE_2 = "CHEBI:46195" SNCA_CURIE = "NCBIGene:6622" FAKE_CURIE = "NOTAREALCURIE!" @@ -133,7 +133,7 @@ def test_get_canonical_curies_simple(): synonymizer = NodeSynonymizer() results = synonymizer.get_canonical_curies(curies) print(results) - assert(len(results) == 3) + assert len(results) == 3 for curie in curies: assert results.get(curie) assert {"preferred_name", "preferred_category", "preferred_curie"} == set(results[curie]) @@ -269,6 +269,28 @@ def test_get_equivalent_nodes_by_curies_and_names(): assert ACETAMINOPHEN_CURIE_2 in results[ACETAMINOPHEN_CURIE] +def test_get_curie_names(): + curies = [ACETAMINOPHEN_CURIE, ACETAMINOPHEN_CURIE_2] + synonymizer = NodeSynonymizer() + results = synonymizer.get_curie_names(curies) + print(results) + assert len(results) == 2 + for curie in curies: + assert results.get(curie) + assert len(set(results.values())) == 2 # Names should be distinct + + +def test_get_preferred_names(): + curies = [ATRIAL_FIBRILLATION_CURIE, IBUPROFEN_CURIE, SNCA_CURIE] + synonymizer = NodeSynonymizer() + results = synonymizer.get_preferred_names(curies) + print(results) + assert len(results) == 3 + for curie in curies: + assert results.get(curie) + assert len(set(results.values())) == 3 # Preferred names for different concepts should be distinct + + def test_get_normalizer_results(): synonymizer = NodeSynonymizer() input_entities = [PARKINSONS_CURIE, CERVICAL_RIB_NAME, IBUPROFEN_CURIE, FAKE_NAME]