-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathMetaboliteReport.py
129 lines (110 loc) · 7.37 KB
/
MetaboliteReport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from ElsevierAPI import open_api_session
import pandas as pd
import os
from ElsevierAPI.ResnetAPI.PathwayStudioGOQL import OQL
from ElsevierAPI.ResnetAPI.NetworkxObjects import PROTEIN_TYPES
import time
from ElsevierAPI.ResnetAPI.ResnetGraph import ResnetGraph
COMMON_METABOLITES={'H2O','PPi', 'ATP','ADP','AMP','Pi','GDP','GTP','NADP+','NADPH+','NAD+','NADH+','acceptor','reduced acceptor',
'oxidized acceptor','oxygen','CoA'}
start_time = time.time()
excel_file_name = 'my_metabolites.xlsx'
excel_file_name = 'D:/Python/MDACC/211109_LipidData_MinhNguyen.xlsx'
input_excel = pd.read_excel(excel_file_name)
metabolite_column = 1
#metabolites names or aliases must be in the first column in Excel file.
input_metabolite_names = []
[input_metabolite_names.append(x) for x in input_excel[input_excel.columns[metabolite_column]] if x not in input_metabolite_names] # making alias list unique
#input_metabolite_names = input_metabolite_names[0:3]
# ps_api retreives data from the database and loads it into APISession.Graph derived from Networkx:MultiDiGraph
api_config = 'path2apiconfig.json'
api_config = 'D:/Python/ENTELLECT_API/ElsevierAPI/APIconfigMDACC.json'
ps_api = open_api_session(api_config) # specify here path to your APIconfig file. Defaults to ./ElsevierAPI/APIconfig.json
ps_api.add_ent_props(['Alias']) # need to retreive aliases from the database in case input metabolites are found by Alias
ps_api.PageSize = 10000
# dump file contains references for all relations retreived from database
# do not use dump file unless you need to include references into report:
ps_api.DumpFiles.clear()
# retreive all ChemicalReaction linked to metabolites in excel_file_name as ResnetGraph from the database:
step = 1000
for i in range(0,len(input_metabolite_names),step):
name_list = input_metabolite_names[i:i+step]
my_goql_query = OQL.expand_entity(name_list,['Name','Alias'], expand_by_rel_types=['ChemicalReaction'])
request_name ='Retrieve metabolic reactions graph for {count} metabolites'.format(count=len(name_list))
ps_api.process_oql(my_goql_query,request_name)
reactions_graph = ps_api.Graph
input_name2objs, objid2input_names = reactions_graph.get_prop2obj_dic('Name', input_metabolite_names)
aliasinput_2objs, objid2input_alias = reactions_graph.get_prop2obj_dic('Alias', input_metabolite_names)
objid2input_names.update(objid2input_alias)
metabolite_ids = list(objid2input_names.keys())
# objid2input_names = {obj_id:[input_names]} - allows for duplicates when mapping by name+alias
# find enzymes linked to ChemicalReactions and retreive their ontology children (proteins)
enzymes = reactions_graph.psobjs_with(only_with_values=PROTEIN_TYPES)
enzymes_ids = [x['Id'][0] for x in enzymes]
ps_api._props2psobj(enzymes_ids,['Id'], get_childs=True, only_obj_types=PROTEIN_TYPES) #loading ontology children for enzymes
#retrieved children are stored in ps_api.ID2Children structure
unique_rows= set() # duplicate row filter. Relations can be duplicated due to diffrent Owner,Effect,Mechanism
temp_report_file = 'temp_report.txt'
with open(temp_report_file, 'w', encoding='utf-8') as f:
f.write('Metabolite input name'+'\t'+'Metabolite name in Database'+'\t'+'direction'+'\t'+'Substrate or Product'+'\t'+'gene'+'\n')
for regulatorID, targetID, reaction in reactions_graph.edges.data('relation'):
if regulatorID in metabolite_ids:
target = reactions_graph._psobj(targetID)
if target['ObjTypeName'][0] == 'SmallMol' and target['Name'][0] not in COMMON_METABOLITES:
enzyme_objects, products = reactions_graph.find_nodes(for_relation=reaction,filter_by=PROTEIN_TYPES)# enzymes are always regulators in ChemicalReaction
gene_names = ps_api.get_children_props(for_psobjs=enzyme_objects,prop_name='Name')
gene_names = ','.join(gene_names)
input_metabolite_obj = reactions_graph._psobj(regulatorID)
metabolite_input_name = objid2input_names[regulatorID]
metabolite_input_name = ','.join(metabolite_input_name)
metabolite_db_name = input_metabolite_obj['Name'][0]
direction = '-->'
other_metabolite = target['Name'][0]
else: continue
elif targetID in metabolite_ids:
regulator = reactions_graph._psobj(regulatorID)
if regulator['ObjTypeName'][0] == 'SmallMol' and regulator['Name'][0] not in COMMON_METABOLITES:
enzyme_objects, products = reactions_graph.find_nodes(for_relation=reaction,filter_by=PROTEIN_TYPES)
gene_names = ps_api.get_children_props(enzyme_objects,'Name')
gene_names = ','.join(gene_names)
target = reactions_graph._psobj(targetID)
metabolite_input_name = objid2input_names[targetID]
metabolite_input_name = ','.join(metabolite_input_name)
metabolite_db_name = target['Name'][0]
direction = '<--'
other_metabolite = regulator['Name'][0]
else: continue
else: continue
tup = (metabolite_db_name,direction,other_metabolite,gene_names)
if tup not in unique_rows:
f.write(metabolite_input_name+'\t'+metabolite_db_name+'\t'+direction+'\t'+other_metabolite+'\t'+gene_names+'\n')
unique_rows.add(tup)
report = pd.read_csv(temp_report_file,sep='\t')
report.sort_values(['Metabolite input name','Substrate or Product'], inplace=True)
report.to_csv('Metabolite report.txt',index=False,sep='\t')
metabolites_with_reactions = set(report['Metabolite input name'])
print('Metabolite Report is generated in %s' % ps_api.execution_time(start_time))
print('Generating report on missing metabolites')
noreaction_metabolites = list(set(input_metabolite_names).difference(metabolites_with_reactions))
mapped_noreaction_graph = ResnetGraph()
for i in range(0,len(noreaction_metabolites),step):
name_list = noreaction_metabolites[i:i+step]
name_list_str = OQL.join_with_quotes(',',name_list)
oql_query = 'SELECT Entity WHERE (Name,Alias) = ({names})'.format(names=name_list_str)
request_name ='Retrieve metabolites for {count} input names'.format(count=len(name_list))
mapped_noreaction_graph.add_graph(ps_api.process_oql(my_goql_query,request_name))
all_input_name2objs, all_objid2input_names = mapped_noreaction_graph.get_prop2obj_dic('Name', input_metabolite_names)
all_aliasinput_2objs, all_objid2input_alias = mapped_noreaction_graph.get_prop2obj_dic('Alias', input_metabolite_names)
all_input_name2objs.update(all_aliasinput_2objs)
with open("mapped metabolites without reactions.txt", 'w', encoding='utf-8') as f:
f.write('Metabolite input name\tMetabolite name in Database\tMapped metabolite URNs\n')
for input_name, psobjects in all_input_name2objs.items():
mapped_urns = [x['URN'][0] for x in psobjects]
mapped_names = [x['Name'][0] for x in psobjects]
f.write(input_name+'\t'+','.join(mapped_names)+'\t'+','.join(mapped_urns) +'\n')
umappped_metabolites = set(noreaction_metabolites).difference(set(all_input_name2objs.keys()))
with open("unmapped metabolites.txt", 'w', encoding='utf-8') as f:
for m in umappped_metabolites:
f.write(m +'\n')
os.remove(temp_report_file)
print('Report is generated in %s' % ps_api.execution_time(start_time))