-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathSimilarityQuestionSolution.py
232 lines (190 loc) · 9.26 KB
/
SimilarityQuestionSolution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# This script will return X that are similar to Y based on high Jaccard index of common one-hop nodes Z (X<->Z<->Y)
import os
import sys
import argparse
# PyCharm doesn't play well with relative imports + python console + terminal
try:
from code.reasoningtool import ReasoningUtilities as RU
except ImportError:
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import ReasoningUtilities as RU
#### Import some Translator API classes
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../UI/OpenAPI/python-flask-server/")
from swagger_server.models.query_graph import QueryGraph
from swagger_server.models.q_node import QNode
from swagger_server.models.q_edge import QEdge
import FormatOutput
import SimilarNodesInCommon
import networkx as nx
class SimilarityQuestionSolution:
def __init__(self):
None
@staticmethod
def answer(source_node_ID, target_node_type, association_node_type, use_json=False, threshold=0.2, n=20):
"""
Answers the question what X are similar to Y based on overlap of common Z nodes. X is target_node_type,
Y is source_node_ID, Z is association_node_type. The relationships are automatically determined in
SimilarNodesInCommon by looking for 1 hop relationships and poping the FIRST one (you are warned).
:param source_node_ID: actual name in the KG
:param target_node_type: kinds of nodes you want returned
:param association_node_type: kind of node you are computing the Jaccard overlap on
:param use_json: print the results in standardized format
:param threshold: only return results where jaccard is >= this threshold
:param n: number of results to return (default 20)
:return: reponse (or printed text)
"""
# Initialize the response class
response = FormatOutput.FormatResponse(5)
# add the column names for the row data
response.message.table_column_names = ["source name", "source ID", "target name", "target ID", "Jaccard index"]
# Initialize the similar nodes class
similar_nodes_in_common = SimilarNodesInCommon.SimilarNodesInCommon()
# get the description
source_node_description = RU.get_node_property(source_node_ID, 'name')
# get the source node label
source_node_label = RU.get_node_property(source_node_ID, 'label')
# Get the nodes in common
node_jaccard_tuples_sorted, error_code, error_message = similar_nodes_in_common.get_similar_nodes_in_common_source_target_association(source_node_ID, target_node_type, association_node_type, threshold)
# reduce to top 100
if len(node_jaccard_tuples_sorted) > n:
node_jaccard_tuples_sorted = node_jaccard_tuples_sorted[0:n]
# make sure that the input node isn't in the list
node_jaccard_tuples_sorted = [i for i in node_jaccard_tuples_sorted if i[0] != source_node_ID]
# check for an error
if error_code is not None or error_message is not None:
if not use_json:
print(error_message)
return
else:
response.add_error_message(error_code, error_message)
response.print()
return
#### If use_json not specified, then return results as a fairly plain list
if not use_json:
to_print = "The %s's involving similar %ss as %s are: \n" % (target_node_type, association_node_type, source_node_description)
for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
to_print += "%s\t%s\tJaccard %f\n" % (other_disease_ID, RU.get_node_property(other_disease_ID, 'name'), jaccard)
print(to_print)
#### Else if use_json requested, return the results in the Translator standard API JSON format
else:
#### Create the QueryGraph for this type of question
query_graph = QueryGraph()
source_node = QNode()
source_node.id = "n00"
source_node.curie = source_node_ID
source_node.type = source_node_label
association_node = QNode()
association_node.id = "n01"
association_node.type = association_node_type
association_node.is_set = True
target_node = QNode()
target_node.id = "n02"
target_node.type = target_node_type
query_graph.nodes = [ source_node,association_node,target_node ]
#source_association_relationship_type = "unknown1"
edge1 = QEdge()
edge1.id = "en00-n01"
edge1.source_id = "n00"
edge1.target_id = "n01"
#edge1.type = source_association_relationship_type
#association_target_relationship_type = "unknown2"
edge2 = QEdge()
edge2.id = "en01-n02"
edge2.source_id = "n01"
edge2.target_id = "n02"
#edge2.type = association_target_relationship_type
query_graph.edges = [ edge1,edge2 ]
#### DONT Suppress the query_graph because we can now do the knowledge_map with v0.9.1
response.message.query_graph = query_graph
#### Create a mapping dict with the source curie and node types and edge types. This dict is used for reverse lookups by type
#### for mapping to the QueryGraph. There is a potential point of failure here if there are duplicate node or edge types. FIXME
response._type_map = dict()
response._type_map[source_node.curie] = source_node.id
response._type_map[association_node.type] = association_node.id
response._type_map[target_node.type] = target_node.id
response._type_map["e"+edge1.source_id+"-"+edge1.target_id] = edge1.id
response._type_map["e"+edge2.source_id+"-"+edge2.target_id] = edge2.id
#### Extract the sorted IDs from the list of tuples
node_jaccard_ID_sorted = [id for id, jac in node_jaccard_tuples_sorted]
# print(RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted, target_node_type,
# [association_node_type], with_rel=[], directed=True, debug=True))
# get the entire subgraph
g = RU.return_subgraph_through_node_labels(source_node_ID, source_node_label, node_jaccard_ID_sorted,
target_node_type,
[association_node_type], with_rel=[], directed=False,
debug=False)
# extract the source_node_number
for node, data in g.nodes(data=True):
if data['properties']['id'] == source_node_ID:
source_node_number = node
break
# Get all the target numbers
target_id2numbers = dict()
node_jaccard_ID_sorted_set = set(node_jaccard_ID_sorted)
for node, data in g.nodes(data=True):
if data['properties']['id'] in node_jaccard_ID_sorted_set:
target_id2numbers[data['properties']['id']] = node
for other_disease_ID, jaccard in node_jaccard_tuples_sorted:
target_name = RU.get_node_property(other_disease_ID, 'name')
to_print = "The %s %s involves similar %ss as %s with similarity value %f" % (
target_node_type, target_name, association_node_type,
source_node_description, jaccard)
# get all the shortest paths between source and target
all_paths = nx.all_shortest_paths(g, source_node_number, target_id2numbers[other_disease_ID])
# get all the nodes on these paths
#try:
if 1 == 1:
rel_nodes = set()
for path in all_paths:
for node in path:
rel_nodes.add(node)
if rel_nodes:
# extract the relevant subgraph
sub_g = nx.subgraph(g, rel_nodes)
# add it to the response
res = response.add_subgraph(sub_g.nodes(data=True), sub_g.edges(data=True), to_print, jaccard, return_result=True)
res.essence = "%s" % target_name # populate with essence of question result
res.essence_type = target_node_type
row_data = [] # initialize the row data
row_data.append("%s" % source_node_description)
row_data.append("%s" % source_node_ID)
row_data.append("%s" % target_name)
row_data.append("%s" % other_disease_ID)
row_data.append("%f" % jaccard)
res.row_data = row_data
# except:
# pass
response.print()
@staticmethod
def describe():
output = "Answers questions of the form: 'What diseases involve similar genes to disease X?' where X is a disease." + "\n"
# TODO: subsample disease nodes
return output
def main():
parser = argparse.ArgumentParser(description="Answers questions of the type 'What X involve similar Y as Z?'.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-s', '--source', type=str, help="source node name (or other name of node in the KG)", default="DOID:8398")
parser.add_argument('-t', '--target', type=str, help="target node type", default="disease")
parser.add_argument('-a', '--association', type=str, help="association node type", default="phenotypic_feature")
parser.add_argument('-j', '--json', action='store_true', help='Flag specifying that results should be printed in JSON format (to stdout)', default=False)
parser.add_argument('--describe', action='store_true', help='Print a description of the question to stdout and quit', default=False)
parser.add_argument('--threshold', type=float, help='Jaccard index threshold (only report other diseases above this)', default=0.2)
parser.add_argument('-n', '--num_res', type=int, help='Maximum number of results to return', default=20)
# Parse and check args
args = parser.parse_args()
source_node_ID = args.source
use_json = args.json
describe_flag = args.describe
threshold = args.threshold
target_node_type = args.target
association_node_type = args.association
n = args.num_res
# Initialize the question class
Q = SimilarityQuestionSolution()
if describe_flag:
res = Q.describe()
print(res)
else:
Q.answer(source_node_ID, target_node_type, association_node_type, use_json=use_json, threshold=threshold, n=n)
if __name__ == "__main__":
main()