-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSemanticReferenceCount.py
230 lines (193 loc) · 13.2 KB
/
SemanticReferenceCount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import time
from ElsevierAPI.pandas.panda_tricks import df,pd
import argparse
import textwrap
from ElsevierAPI.ResnetAPI.PathwayStudioGOQL import OQL
from ElsevierAPI.ResnetAPI.SemanticSearch import SemanticSearch,len, ResnetGraph
from ElsevierAPI.ResnetAPI.ResnetAPISession import SNIPPET_PROPERTIES,BIBLIO_PROPERTIES, ALL_PROPERTIES
if __name__ == "__main__":
EntityListFile = str()
link2concepts = df()
instructions = '''
infile - tab-delimted file with entity idnetifiers. Specify identifier type in each column header.
Identifiers will be used in the order of the columns: if entity is not found by identifier from the first column, identfier from the second column will be used.
infile_has_header - specifies if infile has a header. Defaults to no header. Use "Y" if header is present.
If header is not specified only identifiers in the 1st column will be used for mapping by Name+Alias to database entities
entity_types - comma-separated objectTypeNames for entities in infile
expand - comma-separated objectTypeNames to include corresponding neighbors for each Entity into Semantic search
example: -e GeneticVariant
concepts_file - tab-delimted file. Header: SearchPropertyName<>Effect<>RelationType<>Direction
SearchPropertyName - Required. concepts that must be semantically linked to entities from infile. Header value defines what properties must be used to retreive concepts. Defaults to 'Name,Alias'
Effect - optional. The effect sign ebetween entity in infile and concept in targets_file. Defaults to any.
RelationType - optional. The type of relation between entity in infile and concept in targets_file. Defaults to any.
Direction - optional. The direction of relation between entity in infile and concept in targets_file. Allowed values: '>', '<',''. Defaults to any.
pathways - comma-separated list of pathway names which must be semantically linked to entities from infile.
Semantic reference count to a pathway is calculated as sum of reference count of semantic links to every pathway component
model - same as "pathway" only all pathways are combined into one model
Output: tab-delimted file with rows from infile annotated with semantic reference count between entity (row) and semantic concept (column)
add_references - create additional dump file containing all references supporitng relations found by the script
retreive_entity_props - list of entity properties to include into dump file
use_cache_to_resume - must be 'Y' if you need to continue interrupted download. Will look for map_file.tsv,reference_cache.tsv,counts_cache.tsv to load pre-mapped database identifiers - saves time when infile is re-used. Defaults to start from scratch.
'''
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog=textwrap.dedent(instructions))
parser.add_argument('-i', '--infile', type=str)
parser.add_argument('-e', '--expand', type=str, default='')
parser.add_argument('-r', '--byrels', type=str, default='')
parser.add_argument('-H', '--infile_has_header',action='store_true')
parser.add_argument('-E', '--entity_types', type=str,default='')
parser.add_argument('-f', '--concepts_file', type=str)
parser.add_argument('-d', '--add_references', action='store_true')
parser.add_argument('-a', '--retreive_entity_props', default='')
parser.add_argument('-p', '--pathways', type=str, default='')
parser.add_argument('-m', '--model', type=str, default='')
parser.add_argument('-c', '--use_cache_to_resume', action='store_true')
parser.add_argument('--debug', action="store_true")
parser.add_argument('-t', '--useETM',action='store_true')
args = parser.parse_args()
if args.add_references:
what2retrieve = SNIPPET_PROPERTIES
else:
what2retrieve = BIBLIO_PROPERTIES
expand2 = str(args.expand).split(',') if args.expand else []
by_rels = str(args.byrels).split(',') if args.byrels else []
report_fname = str()
concepts_count = 0
if args.use_cache_to_resume:
search = SemanticSearch(what2retrieve=what2retrieve)
search.RefCountPandas = search.load_pandas(None,None,use_cache=True)
EntityPandas = df.copy_df(search.RefCountPandas)
elif args.infile:
# Script will work faster if you specify what list of object types in your input Entity list
search = SemanticSearch(what2retrieve=what2retrieve)
header_pos = 0 if args.infile_has_header else None
if args.entity_types:
entity_types = str(args.entity_types)
entity_types = entity_types.replace("Small molecule",'SmallMol')
entity_types = str(entity_types).split(',')
else:
entity_types = list()
EntityListFile = str(args.infile) #full path to tab-delimted file with drugs in the first column
link2concepts = df.read(args.concepts_file, header=0)
concepts_count = len(link2concepts)
EntityPandas = df.read(EntityListFile, header=header_pos)
if EntityPandas.empty:
print(f'Cannot read {EntityListFile}')
else:
search.RefCountPandas = search.load_pandas(EntityPandas,args.infile_has_header,map2type=entity_types,expand2=expand2,with_rels=by_rels)
AnnotateNodesWith = str(args.retreive_entity_props).split(',') if args.retreive_entity_props else []
search.add_ent_props(AnnotateNodesWith)
report_fname = EntityListFile[:len(EntityListFile)-4]+'+SemanticRefcount'
else: # custom script option to retrieve entities from database
# add your code here to select entities from database
# Example below finds proteins linked to 'cardiovascular disease' and all its ontology subcategories
# it then loads found proteins into RefCountPandas for semantic search
search = SemanticSearch(what2retrieve=what2retrieve)
entity_types = ['Protein']
entity_types_str = ','.join(entity_types)
AnnotateNodesWith = ['Class']
concept_name = 'cardiovascular disease'
disease = OQL.get_childs(PropertyValues=[concept_name],SearchByProperties=['Name','Alias'],include_parents=True)
relations = 'SELECT Relation WHERE objectType =(Regulation,QuantitativeChange,Biomarker,GeneticChange,StateChange)'
oql_query = f"SELECT Entity WHERE objectType=({entity_types_str}) AND Organism = 'Homo sapiens' AND Connected by ({relations}) to ({disease})"
entity_graph = search.process_oql(oql_query, 'Find disease targets')
id2names = entity_graph.get_properties('Name')
protein_names = [v[0] for v in id2names.values()]
link2concepts = df.from_dict({'Name':[concept_name]})
link2concepts.set_index('Name',inplace=True)
EntityPandas = df.from_dict({'Name':protein_names})
search.RefCountPandas = search.load_pandas(EntityPandas,prop_names_in_header=True, map2type=entity_types)
report_fname = 'SemanticRefcount4entities linked to {}'.format(concept_name)
#If you want to use identifiers other than names enter appropriate Propeprty types into SearchByProperties list
#consult with "Resnet Entities&Properties.txt" for the list of available identifier for nodes in the knowledge graph
global_start_time = time.time()
if args.useETM:
print('adding references from ETM')
for concept_idx in link2concepts.index:
link2concept = link2concepts.iat[concept_idx,0]
search.RefCountPandas = search.etm_refs2df(search.RefCountPandas,[link2concept])
search.add_etm_bibliography()
if args.model:
LinkToPathways = str(args.model).split(",")
print(f"Begin linking entities mapped from infile to model containing {len(LinkToPathways)} pathways")
start_time = time.time()
all_components = set()
for PathwayName in LinkToPathways:
PathwayMembersId2Entity = search.get_pathway_members(
[PathwayName], search_pathways_by=['Name'], only_entities=['Protein', 'FunctionalClass', 'Complex'], with_properties=['objectType'])
pathway_components = set(PathwayMembersId2Entity.values())
all_components.update(pathway_components)
pathway_components_dbids = ResnetGraph.dbids(list(pathway_components))
QueryOntology = OQL.get_childs(list(pathway_components_dbids), ['id'])
children = search.process_oql(QueryOntology,f'Find ontology children for {len(pathway_components_dbids)} pathway components')
all_components.update(children._get_nodes())
if len(pathway_components) == 0:
print ('No entity for %s found in the database' % PathwayName)
if all_components:
how2connect = search.set_how2connect()
search.link2RefCountPandas(f'Model: {(str(args.model))}', list(all_components),how2connect)
search.RefCountPandas['Is in model'] = search.RefCountPandas[search.__temp_id_col__].apply(lambda x: ','.join([n.name() for n in all_components.intersection(search.Graph.psobj_with_dbids(set(x)))]))
exec_time = search.execution_time(start_time)[0]
print(f"Entities in file {EntityListFile} were linked to model with {len(all_components)} components in {exec_time}")
else:
print('No pathways were specified for semantic linking with entities from \"%s\"' % (EntityListFile))
if args.pathways:
print("Begin linking entities mapped from infile to pathways")
LinkToPathways = str(args.pathways).split(",")
start_time = time.time()
for PathwayName in LinkToPathways:
PathwayMembersId2Entity = search.get_pathway_members(
[PathwayName], search_pathways_by=['Name'], only_entities=['Protein', 'FunctionalClass', 'Complex'], with_properties=['objectType'])
pathway_components = set(PathwayMembersId2Entity.values())
pathway_components_dbids = ResnetGraph.dbids(list(pathway_components))
QueryOntology = OQL.get_childs(list(pathway_components_dbids), ['id'])
children = search.process_oql(QueryOntology,f'Find ontology children for {len(pathway_components_dbids)} pathway components')
pathway_components.update(children._get_nodes())
if len(pathway_components) == 0:
print ('No entity for %s found in the database' % PathwayName)
else:
how2connect = search.set_how2connect()
search.link2RefCountPandas(PathwayName, list(pathway_components),how2connect)
exec_time = search.execution_time(start_time)
print("Entities in file %s were linked to %s pathway in %s" % (EntityListFile, PathwayName, exec_time))
else:
print('No pathways were specified for semantic linking with entities from \"%s\"' % (EntityListFile))
if concepts_count:
search_concept_by = str(link2concepts.columns[0]).split(',')
print ('Will search %d concepts by %s' % (len(link2concepts),link2concepts.index.name))
print("\nBegin linking entities mapped from infile to %d concepts" % (concepts_count))
for concept_idx in link2concepts.index:
link2concept = link2concepts.iat[concept_idx,0]
concepts = search._props2psobj(
propValues=[link2concept], search_by_properties=search_concept_by)
if len(concepts) == 0: print ('No entity %s found in database' % link2concept)
else:
connect_by_rels = list()
rel_effect = list()
rel_dir = ''
if 'RelationType' in link2concepts.columns:
rel_t = link2concepts.loc[concept_idx,'RelationType']
if pd.notna(rel_t): connect_by_rels= str(rel_t).split(',')
if 'Effect' in link2concepts.columns:
ef = link2concepts.loc[concept_idx,'Effect']
if pd.notna(ef): rel_effect = str(ef).split(',')
if 'Direction' in link2concepts.columns:
dr = link2concepts.loc[concept_idx,'Direction']
if pd.notna(dr): rel_dir = str(dr)
how2connect = search.set_how2connect(connect_by_rels,rel_effect,rel_dir)
linked_entities_count= search.link2RefCountPandas(link2concept,list(concepts),how2connect)
print("Total %d out of %d concepts interrogated in %s" %
(concept_idx + 1, len(link2concepts), search.execution_time(global_start_time)))
exec_time = search.execution_time(global_start_time)[0]
print("%d entities in file %s were mapped and linked to %d concepts from %s in %s" %
(len(search.RefCountPandas), EntityListFile, concepts_count, args.concepts_file, exec_time))
EntityPandasCounts = EntityPandas.merge_df(search.RefCountPandas,on='Name',how='left',name=search.RefCountPandas._name_)
EntityPandasCounts.copy_format(search.RefCountPandas)
search.RefCountPandas = EntityPandasCounts
search.add2report(search.make_count_df())
if args.add_references:
snippets_df = search.Graph.snippets2df()
search.add2report(snippets_df)
else:
search.add_ps_bibliography()
if report_fname:
search.print_report(report_fname+'.xlsx')