-
Notifications
You must be signed in to change notification settings - Fork 0
/
suspectlist_curation.py
279 lines (219 loc) · 9.85 KB
/
suspectlist_curation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# -*- coding: utf-8 -*-
"""Python-Exercise-SuspectList-Curation.ipynb
Automatically generated by Colaboratory.
# Suspect list Curation
## Installation of packages
This is a Python script in the form of a Google Colab notebook. A notebook shows you results as soon as you run the code, just like a Python console. First, check which Python version is used in this notebook.
"""
from platform import python_version

# Report which Python interpreter version this notebook/script runs under.
interpreter_version = python_version()
print(interpreter_version)
"""Once the packages are installed, you need to import them every time you open the notebook or script."""
# pandas package is used to read the csv files easily and modify them
import pandas as pd
# pubchempy is a package for the PubChem database: it lets us access all entries from within Python in an automated manner, without going to the website. This kind of interface is called an API (application programming interface).
import pubchempy as pcp
# numpy is very similar to pandas, it also helps organize the data in a user friendly manner
import numpy as np
# NOTE: despite its name, this function does NOT test whether the input is a
# string. It detects NaN ("not a number") values: NaN is the only common value
# that compares unequal to itself, so `x != x` is True exactly when x is NaN.
def isNaN(string):
    """Return True if the given value is NaN (an empty cell in pandas), else False."""
    return string != string
#os and glob packages are used to fetch directories
import os
import glob
import re
from pybatchclassyfire import *
# csv package for certain csv files
import csv
# time package
import time
# Reading json files
import json
# normalizing json files
from pandas import json_normalize
# miscellaneous packages we might need for general-purpose functions
import openpyxl
import statistics
import sys
#RDKit is a python cheminformatics toolkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools
# Load the suspect list from disk into a DataFrame.
sl = pd.read_csv('SuspectListCSV.csv')
# Inspect all column names in the suspect list (notebook-style bare expression).
sl.columns
# Add an (initially empty) "synonyms" column that will hold PubChem synonyms.
sl["synonyms"] = np.nan
# Add a "PubChemPY" column that records, per row, what kind of alteration or
# lookup was performed on that entry.
sl["PubChemPY"] = np.nan
# Collector for rows whose SMILES lookup fails for any reason
# (malformed SMILES syntax or a PubChem/pubchempy error).
no_entries_pb = []
# Look up every row that has a SMILES string in PubChem and copy the compound's
# identifiers and properties back into the suspect list.
#
# Make sure the synonyms column can hold Python lists (PubChem returns a list
# of synonyms); the column was created as float NaN, which cannot store lists.
sl["synonyms"] = sl["synonyms"].astype(object)
for index, row in sl.iterrows():
    # Skip rows whose SMILES cell is empty (isNaN is True only for NaN cells).
    if not isNaN(sl["SMILES"][index]):
        # Save the SMILES of this entry for the query and for error reporting.
        sm = sl["SMILES"][index]
        try:
            # Query PubChem by SMILES via the pubchempy API.
            comp = pcp.get_compounds(sm, 'smiles')
            # Non-empty result: PubChem has an entry for this SMILES.
            if comp:
                for c in comp:
                    # Use .at for cell assignment: the original chained
                    # indexing (sl["col"][index] = ...) can silently write to
                    # a copy and triggers pandas' SettingWithCopyWarning.
                    sl.at[index, "PubChemId"] = c.cid
                    sl.at[index, "InChI"] = c.inchi
                    sl.at[index, "iupac"] = c.iupac_name
                    sl.at[index, "Molecular mass"] = c.molecular_weight
                    sl.at[index, "Monoisotopic_mass"] = c.monoisotopic_mass
                    sl.at[index, "Formula"] = c.molecular_formula
                    sl.at[index, "synonyms"] = c.synonyms
                    # Record that this row was filled from PubChem properties.
                    sl.at[index, "PubChemPY"] = "checked comp.properties"
            else:
                # PubChem returned nothing for this SMILES.
                sl.at[index, "PubChemPY"] = "no entry in PubChem"
        # If pubchempy raises (bad SMILES syntax, network/service error),
        # remember the row, the error and the offending SMILES for later curation.
        except Exception as e:
            no_entries_pb.append({
                "i": index,
                "error": e,
                "smiles": sm
            })
no_entries = pd.DataFrame(no_entries_pb)
# Display the updated suspect list (notebook-style bare expression).
sl
"""We deal with ``` no_entries``` dataframe later; this contains wrong SMILES or PubChempy error we can check later. Lets first focus on entries with no SMILES in the suspect list, we can use the names to check the pubchem and add relevant information
"""
# Collect the rows that have no SMILES at all; we will try to resolve those
# by compound name in the next step.
no_smiles_sl = []
for index, row in sl.iterrows():
    # isNaN is True when the SMILES cell is empty (NaN).
    if isNaN(sl["SMILES"][index]):
        # Mark the row so we know its data could not come from a SMILES lookup.
        # .at avoids pandas chained assignment, which may write to a copy.
        sl.at[index, "PubChemPY"] = "no smiles in sl"
        # Keep the original row index and the compound name for the
        # name-based lookup below.
        no_smiles_sl.append({
            "i": index,
            "names": sl["Name"][index]
        })
no_smiles = pd.DataFrame(no_smiles_sl)
# Display the collected rows (notebook-style bare expression).
no_smiles
"""Above you can see a nitrite reductase. For that you can manually check uniprot and see what are the reactants and products for this enzyme and put nitrute reductase in the enzyme section"""
# Errors raised during the name-based PubChem lookups are collected here.
no_entries_nm = []
# Ensure the synonyms column can hold Python lists (idempotent; see the
# SMILES-lookup step above for the same conversion).
sl["synonyms"] = sl["synonyms"].astype(object)
# For each compound that had no SMILES, try to resolve it by name instead.
# The "i" column of no_smiles holds the row's original index in `sl`, so we
# can address the suspect-list row directly instead of re-scanning all of sl
# for every entry (the original nested loop was accidentally O(n^2)).
for index, row in no_smiles.iterrows():
    idx = no_smiles["i"][index]
    # Skip enzymes: names containing "ase" denote enzymes, which have no
    # single small-molecule structure to fetch from PubChem.
    if "ase" in no_smiles["names"][index]:
        continue
    # Save the compound name for the query and for error reporting.
    nameSL = sl["Name"][idx]
    try:
        # Query PubChem by compound name via the pubchempy API.
        n_comp = pcp.get_compounds(nameSL, 'name')
        # A PubChem entry exists under this name: copy its properties in.
        if n_comp:
            for c in n_comp:
                # .at avoids pandas chained assignment (may write to a copy).
                sl.at[idx, "PubChemId"] = c.cid
                sl.at[idx, "InChI"] = c.inchi
                sl.at[idx, "iupac"] = c.iupac_name
                sl.at[idx, "SMILES"] = c.isomeric_smiles
                sl.at[idx, "Molecular mass"] = c.molecular_weight
                sl.at[idx, "Monoisotopic_mass"] = c.monoisotopic_mass
                sl.at[idx, "Formula"] = c.molecular_formula
                sl.at[idx, "synonyms"] = c.synonyms
                sl.at[idx, "PubChemPY"] = "checked comp.properties"
        else:
            # PubChem has no entry under this name.
            sl.at[idx, "PubChemPY"] = "no NAME in PubChem"
    except Exception as e:
        # BUG FIX: errors were previously appended to no_entries_pb, which
        # left no_entries_nm permanently empty and made the "no errors with
        # names" check below vacuous. Append to the correct list.
        no_entries_nm.append({
            "i": idx,
            "error": e,
            "names": nameSL
        })
no_entriesNMS = pd.DataFrame(no_entries_nm)
# If this is empty, no errors occurred during the name-based lookups.
no_entriesNMS
# Going back to the rows whose SMILES lookup raised an error
# (notebook-style bare expression to display them):
no_entries
# We keep the first compound as it is, and we do not change compounds 35 and 36:
# their SMILES contain '*', the SMILES wildcard that symbolizes an unknown atom
# of any element.
"""## classification
Let's add a classification to the compounds. ChemONT is a chemical classification ontology used by a tool called ClassyFire. We can use ClassyFire via its web service, or directly in Python as we will do in this notebook.
"""
# Build a table of (index, smiles, inchi, inchikey) for every row that has a
# SMILES string, converting with RDKit. Rows whose SMILES cannot be converted
# are skipped (best-effort pass).
inchis = []
# Work on the suspect list under the name used by the classification code below.
frame = sl
for i, row in frame.iterrows():
    smiles = frame['SMILES'][i]
    # Only rows that actually have a SMILES entry can be converted.
    if not isNaN(smiles):
        try:
            # SMILES -> Mol -> InChI -> InChIKey using RDKit. MolFromSmiles
            # returns None for unparseable SMILES, which typically makes
            # MolToInchi raise, so bad SMILES rows are skipped below.
            InChI = Chem.MolToInchi(Chem.MolFromSmiles(smiles))
            InChIKey = Chem.inchi.InchiToInchiKey(InChI)
            # Store the identifiers; the list becomes a DataFrame afterwards.
            inchis.append({
                'index': i,
                'smiles': smiles,
                'inchi': InChI,
                'inchikey': InChIKey
            })
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; conversion failures are deliberately ignored.
            pass
inchis = pd.DataFrame(inchis)
# If the inchis dataframe has entries, keep only the rows where an InChIKey
# was actually produced.
if len(inchis):
    # Use `~` for boolean negation of the mask: the original unary minus
    # (`-mask`) on a boolean Series raises TypeError in modern pandas.
    inchis = inchis.loc[~isNaN(inchis['inchikey'])]
## Retrieve ClassyFire classifications ##
# This first step is done using inchikey and interrogation of the gnps classified structures
# Endpoint configuration for the ClassyFire lookup. NOTE(review): the GNPS
# proxy is presumably a mirror of already-classified structures — confirm in
# the pybatchclassyfire documentation.
gnps_proxy = True
url = "http://classyfire.wishartlab.com"
proxy_url = "https://gnps-classyfire.ucsd.edu"
# Batch size per request and pause between requests (web-service rate limiting).
chunk_size = 1000
sleep_interval = 12
# Unique InChIKeys to classify.
all_inchi_keys = list(inchis['inchikey'].drop_duplicates())
# Seeded with two zeros so the [-2]/[-3] look-backs below are valid from the
# first iteration onwards.
resolved_ik_number_list = [0, 0]
total_inchikey_number = len(all_inchi_keys)
# Keep querying until the number of resolved InChIKeys stops improving.
while True:
    # get_classifications_cf_mod / cleanse come from pybatchclassyfire;
    # results accumulate in 'all_json.json'.
    get_classifications_cf_mod(all_inchi_keys, par_level = 6)
    cleanse('all_json.json', 'all_json.json')
    with open("all_json.json") as tweetfile:
        jsondic = json.loads(tweetfile.read())
    df = json_normalize(jsondic)
    df = df.drop_duplicates( 'inchikey' )
    # Count how many distinct InChIKeys have been resolved so far.
    resolved_ik_number = len( df.drop_duplicates('inchikey').inchikey )
    resolved_ik_number_list.append( resolved_ik_number )
    # Stop when the count decreased, or equals the count from two rounds ago
    # (i.e. no further progress is being made).
    if resolved_ik_number_list[-1] < resolved_ik_number_list[-2] or resolved_ik_number_list[-1] == resolved_ik_number_list[-3]:
        break
# Final cleanup pass into a separate file, then flatten the JSON results.
cleanse('all_json.json', 'all_json_cleaned.json')
with open("all_json_cleaned.json") as tweetfile:
    jsondic = json.loads(tweetfile.read())
flattened_classified_json = json_normalize(jsondic)
flattened_df = flattened_classified_json.drop_duplicates('inchikey')
# Strip the "InChIKey=" prefix so the keys match the bare keys in `inchis`.
flattened_df['inchikey'] = flattened_df['inchikey'].str.replace(r'InChIKey=', '')
# Attach the classification columns to our inchis table; the left join keeps
# unclassified rows with NaN classification fields.
df_merged = pd.merge(inchis, flattened_df, left_on='inchikey', right_on='inchikey', how='left')
# Copy the ClassyFire class hierarchy back onto the suspect list, matching
# classified rows to suspect-list rows by their SMILES string.
# BUG FIX: the original compared the strings with `is` (object identity),
# which only matches when both happen to be the same interned object; string
# equality must use `==`.
for p, rowp in df_merged.iterrows():
    for q, rowq in frame.iterrows():
        if df_merged["smiles_x"][p] == frame["SMILES"][q]:
            frame.loc[q, 'sub_class'] = df_merged["subclass.name"][p]
            frame.loc[q, 'class'] = df_merged["class.name"][p]
            frame.loc[q, 'super_class'] = df_merged["superclass.name"][p]
            frame.loc[q, 'Classification_Source'] = "ClassyFire"