Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some update #2355

Merged
merged 11 commits into from
Aug 26, 2024
138 changes: 78 additions & 60 deletions code/ARAX/ARAXQuery/ARAX_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,24 @@ def _normalize_number_of_edges(edge_number):

return normalized_value

def _normalize_number_of_drugbank_edges(drugbank_edge_number):

def _calculate_final_individual_edge_confidence(base_score: int, attribute_scores: List[float]) -> float:

sorted_attribute_scores = sorted(attribute_scores, reverse=True)

# use Eric's loop algorithm
W_r = base_score

for W_i in attribute_scores:
W_r = W_r + (1 - W_r) * W_i

return W_r

def _normalize_number_of_goldsource_edges(goldsource_edge_number):
"""
Normalize the number of gold-source edges to be between 0 and 1
"""
value = drugbank_edge_number
value = goldsource_edge_number
max_value = 1.0
curve_steepness = 3
midpoint = 0
Expand All @@ -66,43 +79,38 @@ def _normalize_number_of_drugbank_edges(drugbank_edge_number):

return normalized_value

def _calculate_final_edge_score(kg_edge_id_to_edge: Dict[str, Edge], edge_binding_list: List[Dict], alpha: float = 0.8, beta: float = 0.1) -> float:
def _calculate_final_result_score(kg_edge_id_to_edge: Dict[str, Edge], edge_binding_list: List[Dict]) -> float:
"""
Calculate the final edge score for a given edge binding list considering the individual base edge confidence scores, the number of edges, and the
presence of drugbank edges. The algorithm is as follows:
final_score= alpha x max_score + beta x normalized_edge_count + gamma x drugbank_proportion
Calculate the final result score for a given edge binding list considering the individual base edge confidence scores. The looping algorithm is used:
W_r = W_r + (1 - W_r) * W_i

1. to consider the individual base edge confidence scores, the max score of all edge confidence is calculated.
max_score = max([edge.confidence for edge in edge_binding_list])
Here are the steps:
1. sort all edge scores in descending order
2. use looping algorithm to combine all sorted edge scores

2. to consider the number of edges, the normalized edge count is calculated.
normalized_edge_count = _normalize_number_of_edges(# of non-semmeddb nonvirtual edges)

3. to consider the presence of drugbank edges, the drugbank edge count is calculated.
normalized_drugbank_edge_count = _normalize_number_of_drugbank_edges(# of drugbank edges)
Here is an example:
Given score list: 0.994, 0.93, 0.85, 0.68

We have:
Round W_i W_r
1 0.994 0.994
2 0.93 0.99958
3 0.85 0.999937
4 0.68 0.99997984
Final result score = 0.99997984

Parameters:
kg_edge_id_to_edge (Dict[str, Edge]): A dictionary mapping edge IDs to Edge objects.
edge_binding_list (List[Dict]): A list of dictionaries containing edge bindings.
alpha (float): Weight for the average score of edges.
beta (float): Weight for the normalized number of edges.
Returns:
float: The final combined score between 0 and 1.
"""

# Calculate the max score of all edge confidences
max_score = max([kg_edge_id_to_edge[edge_binding.id].confidence for edge_binding in edge_binding_list])

# Calculate the number of non-semmeddb nonvirtual edges
number_of_non_semmdb_nonvirtual_edges = len([edge_binding.id for edge_binding in edge_binding_list if 'infores:' in edge_binding.id and edge_binding.id.split('--')[-1] != 'infores:semmeddb'])
normalized_edge_count = _normalize_number_of_edges(number_of_non_semmdb_nonvirtual_edges)

# Calculate the number of drugbank edges
drugbank_edge_count = len([edge_binding.id for edge_binding in edge_binding_list if edge_binding.id.split('--')[-1] == 'infores:drugbank'])
normalized_drugbank_edge_count = _normalize_number_of_drugbank_edges(drugbank_edge_count)
# Calculate final result score
all_edge_scores = [kg_edge_id_to_edge[edge_binding.id].confidence for edge_binding in edge_binding_list]

# Calculate the final score
final_score = alpha * max_score + beta * normalized_edge_count + (1 - alpha - beta) * normalized_drugbank_edge_count
final_score = _calculate_final_individual_edge_confidence(0, all_edge_scores)

return final_score

Expand All @@ -117,7 +125,7 @@ def _get_weighted_graph_networkx_from_result_graph(kg_edge_id_to_edge: Dict[str,
for analysis in result.analyses: # For now we only ever have one Analysis per Result
for qedge_key, edge_binding_list in analysis.edge_bindings.items():
qedge_tuple = qg_edge_key_to_edge_tuple[qedge_key]
res_graph[qedge_tuple[0]][qedge_tuple[1]][qedge_tuple[2]]['weight'] = _calculate_final_edge_score(kg_edge_id_to_edge, edge_binding_list)
res_graph[qedge_tuple[0]][qedge_tuple[1]][qedge_tuple[2]]['weight'] = _calculate_final_result_score(kg_edge_id_to_edge, edge_binding_list)

return res_graph

Expand Down Expand Up @@ -260,14 +268,8 @@ def __init__(self):
self.response = None
self.message = None
self.parameters = None
# edge attributes we know about
self.known_attributes = {'probability', 'normalized_google_distance', 'jaccard_index',
'probability_treats', 'paired_concept_frequency',
'observed_expected_ratio', 'chi_square', 'chi_square_pvalue', 'MAGMA-pvalue', 'Genetics-quantile',
'pValue', 'fisher_exact_test_p-value','Richards-effector-genes',
'feature_coefficient', 'CMAP similarity score'}
# how much we trust each of the edge attributes
self.known_attributes_to_trust = {'probability': 0.5,
self.known_attributes_to_trust = {'probability': 0.8,
'normalized_google_distance': 0.8,
'jaccard_index': 0.5,
'probability_treats': 0.8,
Expand All @@ -282,9 +284,15 @@ def __init__(self):
'Richards-effector-genes': 0.5,
'feature_coefficient': 1.0,
'CMAP similarity score': 1.0,
'publications': 0.5, # downweight publications (including those from semmeddb)
'text-mining-provider': 0.8
}
# how much we trust each data source
self.data_source_base_weights = {'infores:semmeddb': 0.5, # downweight semmeddb
'infores:text-mining-provider': 0.85,
'infores:drugcentral': 0.93,
'infores:drugbank': 0.99
# we can define the more customized weights for other data sources here later if needed.
}

self.virtual_edge_types = {}
self.score_stats = dict() # dictionary that stores that max's and min's of the edge attribute values
self.kg_edge_id_to_edge = dict() # map between the edge id's in the results and the actual edges themselves
Expand Down Expand Up @@ -341,50 +349,60 @@ def result_confidence_maker(self, result):
# then assign result confidence as average/median of these "single" edge confidences?
result.confidence = 1

def edge_attribute_score_combiner(self, edge):
def edge_attribute_score_combiner(self, edge_key, edge):
"""
This function takes a single edge and decides how to combine its attribute scores into a single confidence
Eventually we will want
1. To weight different attributes by different amounts
2. Figure out what to do with edges that have no attributes
"""
edge_best_score = 1
edge_score_list = []
edge_attribute_dict = {}
edge_default_base = 0.75
edge_attribute_score_list = []

# find data source from edge_key
if edge_key.split('--')[-1] in self.data_source_base_weights:
base = self.data_source_base_weights[edge_key.split('--')[-1]]
elif 'infores' in edge_key.split('--')[-1]: # default score for other data sources
base = edge_default_base
else: # virtual edges or inferred edges
base = 0 # no base score for these edges; their score comes only from their attribute scores

if edge.attributes is not None:
for edge_attribute in edge.attributes:
if edge_attribute.original_attribute_name == "biolink:knowledge_level": # this probably means it's a fact or high-quality edge from reliable source, we tend to trust it.
edge_score_list.append(edge_best_score)
break

# if edge_attribute.original_attribute_name == "biolink:knowledge_level": # this probably means it's a fact or high-quality edge from reliable source, we tend to trust it.
# TODO: we might consider the value from this attribute name in the future

# if a specific attribute found, normalize its score and add it to the list
if edge_attribute.original_attribute_name is not None:
edge_attribute_dict[edge_attribute.original_attribute_name] = edge_attribute.value
normalized_score = self.edge_attribute_score_normalizer(edge_attribute.original_attribute_name, edge_attribute.value)
else:
edge_attribute_dict[edge_attribute.attribute_type_id] = edge_attribute.value
normalized_score = self.edge_attribute_score_normalizer(edge_attribute.attribute_type_id, edge_attribute.value)
if edge_attribute.attribute_type_id == "biolink:publications":
if edge_attribute.attribute_type_id == "biolink:publications" and (edge_attribute.attribute_source is None or edge_attribute.attribute_source == "infores:semmeddb"):
# only publications from semmeddb are used to calculate the confidence in this way
normalized_score = self.edge_attribute_publication_normalizer(edge_attribute.attribute_type_id, edge_attribute.value)

if self.known_attributes_to_trust.get(edge_attribute.original_attribute_name, None) is not None:
edge_score_list.append(normalized_score * self.known_attributes_to_trust[edge_attribute.original_attribute_name])
elif edge_attribute.attribute_type_id == "biolink:publications":
edge_score_list.append(normalized_score * self.known_attributes_to_trust['publications'])
elif edge_attribute.attribute_type_id == "biolink:primary_knowledge_source" and edge_attribute.value == "infores:text-mining-provider-targeted":
edge_score_list.append(1 * self.known_attributes_to_trust['text-mining-provider'])

if self.known_attributes_to_trust.get(edge_attribute.original_attribute_name, None):
if normalized_score > 0:
edge_attribute_score_list.append(normalized_score * self.known_attributes_to_trust[edge_attribute.original_attribute_name])
elif self.known_attributes_to_trust.get(edge_attribute.attribute_type_id, None):
if normalized_score > 0:
edge_attribute_score_list.append(normalized_score * self.known_attributes_to_trust[edge_attribute.attribute_type_id])
elif edge_attribute.attribute_type_id == "biolink:publications" and (edge_attribute.attribute_source is None or edge_attribute.attribute_source == "infores:semmeddb"):
if normalized_score > 0:
edge_attribute_score_list.append(normalized_score)
else:
# this means we have no current normalization of this kind of attribute,
# so don't do anything to the score since we don't know what to do with it yet
# add more rules in the future
continue

if len(edge_score_list) == 0: # if no appropriate attribute for score calculation, set the confidence to 1
edge_confidence = edge_best_score
if len(edge_attribute_score_list) == 0: # if no appropriate attribute for score calculation, fall back to the base score
edge_confidence = base
else:
edge_confidence = np.max(edge_score_list) # if attributes has multiple scores, take the largest one
edge_confidence = _calculate_final_individual_edge_confidence(base, edge_attribute_score_list)
else:
edge_confidence = edge_best_score
edge_confidence = base

return edge_confidence

Expand All @@ -393,7 +411,7 @@ def edge_attribute_score_normalizer(self, edge_attribute_name: str, edge_attribu
Takes an input edge attribute and value, dispatches it to the appropriate method that translates the value into
something in the interval [0,1] where 0 is worse and 1 is better
"""
if edge_attribute_name not in self.known_attributes:
if edge_attribute_name not in self.known_attributes_to_trust:
return -1 # TODO: might want to change this
else:
if edge_attribute_value == "no value!":
Expand Down Expand Up @@ -679,7 +697,7 @@ def aggregate_scores_dmk(self, response):
kg_edge_id_to_edge[edge_key] = edge
if edge.attributes is not None:
for edge_attribute in edge.attributes:
for attribute_name in self.known_attributes:
for attribute_name in self.known_attributes_to_trust:
if edge_attribute.original_attribute_name == attribute_name or edge_attribute.attribute_type_id == attribute_name:
if edge_attribute.value == "no value!":
edge_attribute.value = 0
Expand Down Expand Up @@ -731,7 +749,7 @@ def aggregate_scores_dmk(self, response):
edge.confidence = edge_attributes['confidence']
#continue
else:
confidence = self.edge_attribute_score_combiner(edge)
confidence = self.edge_attribute_score_combiner(edge_key, edge)
#edge.attributes.append(Attribute(name="confidence", value=confidence))
edge.confidence = confidence

Expand Down
8 changes: 4 additions & 4 deletions code/ARAX/test/test_ARAX_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def test_xcrg_infer_bomeol():
if len(creative_mode_edges) != 0:
edge_key = creative_mode_edges[0]
edge_result = message.knowledge_graph.edges[edge_key]
assert edge_result.predicate == 'biolink:regulates'
assert edge_result.predicate in ['biolink:regulates', 'biolink:affects']

@pytest.mark.slow
def test_xcrg_with_qg1():
Expand All @@ -263,7 +263,7 @@ def test_xcrg_with_qg1():
"r_edge": {
"object": "gene",
"subject": "chemical",
"predicates": ["biolink:regulates"],
"predicates": ['biolink:regulates', 'biolink:affects'],
"knowledge_type": "inferred",
"qualifier_constraints": [
{
Expand Down Expand Up @@ -313,7 +313,7 @@ def test_xcrg_with_qg2():
"r_edge": {
"object": "gene",
"subject": "chemical",
"predicates": ["biolink:regulates"],
"predicates": ['biolink:regulates', 'biolink:affects'],
"knowledge_type": "inferred",
"qualifier_constraints": [
{
Expand Down Expand Up @@ -362,7 +362,7 @@ def test_xcrg_with_only_qg():
"r_edge": {
"object": "gene",
"subject": "chemical",
"predicates": ["biolink:regulates"],
"predicates": ["biolink:regulates", "biolink:affects"],
"knowledge_type": "inferred",
"qualifier_constraints": [
{
Expand Down
Loading
Loading