forked from PaddlePaddle/PaddleHelix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
featurizer.py
352 lines (305 loc) · 14 KB
/
featurizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# Adapted from https://gitlab.com/cheminfIBB/tfbio/-/blob/master/tfbio/data.py
import pickle
import numpy as np
from openbabel import pybel
class Featurizer():
"""Calcaulates atomic features for molecules. Features can encode atom type,
native pybel properties or any property defined with SMARTS patterns
Attributes
----------
FEATURE_NAMES: list of strings
Labels for features (in the same order as features)
NUM_ATOM_CLASSES: int
Number of atom codes
ATOM_CODES: dict
Dictionary mapping atomic numbers to codes
NAMED_PROPS: list of string
Names of atomic properties to retrieve from pybel.Atom object
CALLABLES: list of callables
Callables used to calculcate custom atomic properties
SMARTS: list of SMARTS strings
SMARTS patterns defining additional atomic properties
"""
def __init__(self, atom_codes=None, atom_labels=None,
named_properties=None, save_molecule_codes=True,
custom_properties=None, smarts_properties=None,
smarts_labels=None):
"""Creates Featurizer with specified types of features. Elements of a
feature vector will be in a following order: atom type encoding
(defined by atom_codes), Pybel atomic properties (defined by
named_properties), molecule code (if present), custom atomic properties
(defined `custom_properties`), and additional properties defined with
SMARTS (defined with `smarts_properties`).
Parameters
----------
atom_codes: dict, optional
Dictionary mapping atomic numbers to codes. It will be used for
one-hot encoging therefore if n different types are used, codes
shpuld be from 0 to n-1. Multiple atoms can have the same code,
e.g. you can use {6: 0, 7: 1, 8: 1} to encode carbons with [1, 0]
and nitrogens and oxygens with [0, 1] vectors. If not provided,
default encoding is used.
atom_labels: list of strings, optional
Labels for atoms codes. It should have the same length as the
number of used codes, e.g. for `atom_codes={6: 0, 7: 1, 8: 1}` you
should provide something like ['C', 'O or N']. If not specified
labels 'atom0', 'atom1' etc are used. If `atom_codes` is not
specified this argument is ignored.
named_properties: list of strings, optional
Names of atomic properties to retrieve from pybel.Atom object. If
not specified ['hyb', 'heavyvalence', 'heterovalence',
'partialcharge'] is used.
save_molecule_codes: bool, optional (default True)
If set to True, there will be an additional feature to save
molecule code. It is usefeul when saving molecular complex in a
single array.
custom_properties: list of callables, optional
Custom functions to calculate atomic properties. Each element of
this list should be a callable that takes pybel.Atom object and
returns a float. If callable has `__name__` property it is used as
feature label. Otherwise labels 'func<i>' etc are used, where i is
the index in `custom_properties` list.
smarts_properties: list of strings, optional
Additional atomic properties defined with SMARTS patterns. These
patterns should match a single atom. If not specified, deafult
patterns are used.
smarts_labels: list of strings, optional
Labels for properties defined with SMARTS. Should have the same
length as `smarts_properties`. If not specified labels 'smarts0',
'smarts1' etc are used. If `smarts_properties` is not specified
this argument is ignored.
"""
# Remember namse of all features in the correct order
self.FEATURE_NAMES = []
if atom_codes is not None:
if not isinstance(atom_codes, dict):
raise TypeError('Atom codes should be dict, got %s instead'
% type(atom_codes))
codes = set(atom_codes.values())
for i in range(len(codes)):
if i not in codes:
raise ValueError('Incorrect atom code %s' % i)
self.NUM_ATOM_CLASSES = len(codes)
self.ATOM_CODES = atom_codes
if atom_labels is not None:
if len(atom_labels) != self.NUM_ATOM_CLASSES:
raise ValueError('Incorrect number of atom labels: '
'%s instead of %s'
% (len(atom_labels), self.NUM_ATOM_CLASSES))
else:
atom_labels = ['atom%s' % i for i in range(self.NUM_ATOM_CLASSES)]
self.FEATURE_NAMES += atom_labels
else:
self.ATOM_CODES = {}
metals = ([3, 4, 11, 12, 13] + list(range(19, 32))
+ list(range(37, 51)) + list(range(55, 84))
+ list(range(87, 104)))
# List of tuples (atomic_num, class_name) with atom types to encode.
atom_classes = [
(5, 'B'),
(6, 'C'),
(7, 'N'),
(8, 'O'),
(15, 'P'),
(16, 'S'),
(34, 'Se'),
([9, 17, 35, 53], 'halogen'),
(metals, 'metal')
]
for code, (atom, name) in enumerate(atom_classes):
if type(atom) is list:
for a in atom:
self.ATOM_CODES[a] = code
else:
self.ATOM_CODES[atom] = code
self.FEATURE_NAMES.append(name)
self.NUM_ATOM_CLASSES = len(atom_classes)
if named_properties is not None:
if not isinstance(named_properties, (list, tuple, np.ndarray)):
raise TypeError('named_properties must be a list')
allowed_props = [prop for prop in dir(pybel.Atom)
if not prop.startswith('__')]
for prop_id, prop in enumerate(named_properties):
if prop not in allowed_props:
raise ValueError(
'named_properties must be in pybel.Atom attributes,'
' %s was given at position %s' % (prop_id, prop)
)
self.NAMED_PROPS = named_properties
else:
# pybel.Atom properties to save
self.NAMED_PROPS = ['hyb', 'heavydegree', 'heterodegree',
'partialcharge']
self.FEATURE_NAMES += self.NAMED_PROPS
if not isinstance(save_molecule_codes, bool):
raise TypeError('save_molecule_codes should be bool, got %s '
'instead' % type(save_molecule_codes))
self.save_molecule_codes = save_molecule_codes
if save_molecule_codes:
# Remember if an atom belongs to the ligand or to the protein
self.FEATURE_NAMES.append('molcode')
self.CALLABLES = []
if custom_properties is not None:
for i, func in enumerate(custom_properties):
if not callable(func):
raise TypeError('custom_properties should be list of'
' callables, got %s instead' % type(func))
name = getattr(func, '__name__', '')
if name == '':
name = 'func%s' % i
self.CALLABLES.append(func)
self.FEATURE_NAMES.append(name)
if smarts_properties is None:
# SMARTS definition for other properties
self.SMARTS = [
'[#6+0!$(*~[#7,#8,F]),SH0+0v2,s+0,S^3,Cl+0,Br+0,I+0]',
'[a]',
'[!$([#1,#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]',
'[!$([#6,H0,-,-2,-3]),$([!H0;#7,#8,#9])]',
'[r]'
]
smarts_labels = ['hydrophobic', 'aromatic', 'acceptor', 'donor',
'ring']
elif not isinstance(smarts_properties, (list, tuple, np.ndarray)):
raise TypeError('smarts_properties must be a list')
else:
self.SMARTS = smarts_properties
if smarts_labels is not None:
if len(smarts_labels) != len(self.SMARTS):
raise ValueError('Incorrect number of SMARTS labels: %s'
' instead of %s'
% (len(smarts_labels), len(self.SMARTS)))
else:
smarts_labels = ['smarts%s' % i for i in range(len(self.SMARTS))]
# Compile patterns
self.compile_smarts()
self.FEATURE_NAMES += smarts_labels
def compile_smarts(self):
self.__PATTERNS = []
for smarts in self.SMARTS:
self.__PATTERNS.append(pybel.Smarts(smarts))
def encode_num(self, atomic_num):
"""Encode atom type with a binary vector. If atom type is not included in
the `atom_classes`, its encoding is an all-zeros vector.
Parameters
----------
atomic_num: int
Atomic number
Returns
-------
encoding: np.ndarray
Binary vector encoding atom type (one-hot or null).
"""
if not isinstance(atomic_num, int):
raise TypeError('Atomic number must be int, %s was given'
% type(atomic_num))
encoding = np.zeros(self.NUM_ATOM_CLASSES)
try:
encoding[self.ATOM_CODES[atomic_num]] = 1.0
except:
pass
return encoding
def find_smarts(self, molecule):
"""Find atoms that match SMARTS patterns.
Parameters
----------
molecule: pybel.Molecule
Returns
-------
features: np.ndarray
NxM binary array, where N is the number of atoms in the `molecule`
and M is the number of patterns. `features[i, j]` == 1.0 if i'th
atom has j'th property
"""
if not isinstance(molecule, pybel.Molecule):
raise TypeError('molecule must be pybel.Molecule object, %s was given'
% type(molecule))
features = np.zeros((len(molecule.atoms), len(self.__PATTERNS)))
for (pattern_id, pattern) in enumerate(self.__PATTERNS):
atoms_with_prop = np.array(list(*zip(*pattern.findall(molecule))),
dtype=int) - 1
features[atoms_with_prop, pattern_id] = 1.0
return features
def get_features(self, molecule, molcode=None):
"""Get coordinates and features for all heavy atoms in the molecule.
Parameters
----------
molecule: pybel.Molecule
molcode: float, optional
Molecule type. You can use it to encode whether an atom belongs to
the ligand (1.0) or to the protein (-1.0) etc.
Returns
-------
coords: np.ndarray, shape = (N, 3)
Coordinates of all heavy atoms in the `molecule`.
features: np.ndarray, shape = (N, F)
Features of all heavy atoms in the `molecule`: atom type
(one-hot encoding), pybel.Atom attributes, type of a molecule
(e.g protein/ligand distinction), and other properties defined with
SMARTS patterns
"""
if not isinstance(molecule, pybel.Molecule):
raise TypeError('molecule must be pybel.Molecule object,'
' %s was given' % type(molecule))
if molcode is None:
if self.save_molecule_codes is True:
raise ValueError('save_molecule_codes is set to True,'
' you must specify code for the molecule')
elif not isinstance(molcode, (float, int)):
raise TypeError('motlype must be float, %s was given'
% type(molcode))
coords = []
features = []
heavy_atoms = []
for i, atom in enumerate(molecule):
# ignore hydrogens and dummy atoms (they have atomicnum set to 0)
if atom.atomicnum > 1:
heavy_atoms.append(i)
coords.append(atom.coords)
features.append(np.concatenate((
self.encode_num(atom.atomicnum),
[atom.__getattribute__(prop) for prop in self.NAMED_PROPS],
[func(atom) for func in self.CALLABLES],
)))
coords = np.array(coords, dtype=np.float32)
features = np.array(features, dtype=np.float32)
if self.save_molecule_codes:
features = np.hstack((features,
molcode * np.ones((len(features), 1))))
features = np.hstack([features,
self.find_smarts(molecule)[heavy_atoms]])
if np.isnan(features).any():
raise RuntimeError('Got NaN when calculating features')
return coords, features
def to_pickle(self, fname='featurizer.pkl'):
"""Save featurizer in a given file. Featurizer can be restored with
`from_pickle` method.
Parameters
----------
fname: str, optional
Path to file in which featurizer will be saved
"""
# patterns can't be pickled, we need to temporarily remove them
patterns = self.__PATTERNS[:]
del self.__PATTERNS
try:
with open(fname, 'wb') as f:
pickle.dump(self, f)
finally:
self.__PATTERNS = patterns[:]
@staticmethod
def from_pickle(fname):
"""Load pickled featurizer from a given file
Parameters
----------
fname: str, optional
Path to file with saved featurizer
Returns
-------
featurizer: Featurizer object
Loaded featurizer
"""
with open(fname, 'rb') as f:
featurizer = pickle.load(f)
featurizer.compile_smarts()
return featurizer