-
Notifications
You must be signed in to change notification settings - Fork 15
/
utils.py
79 lines (69 loc) · 1.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import random
import numpy as np
import torch
import dgl
import logging
# Integer code for each protein sequence character. Codes start at 1 and follow
# the order of the alphabet string below (note that "J" has no entry), which
# leaves 0 free for the zero-filled padding that integer_label_protein produces.
CHARPROTSET = {
    residue: code
    for code, residue in enumerate("ACBEDGFIHKMLONQPSRUTWVYXZ", start=1)
}
# Number of distinct characters in the table (25).
CHARPROTLEN = len(CHARPROTSET)
def set_seed(seed=1000):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed (int): Seed value applied to every generator. Defaults to 1000.

    Side effects:
        * Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)``.
        * When CUDA is available, switches cuDNN to deterministic mode and
          turns off its benchmark mode.
    """
    # NOTE: the interpreter reads PYTHONHASHSEED at startup, so assigning it
    # here is picked up by child processes, not by the already-running one.
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
def graph_collate_func(x):
    """Collate a list of ``(d, p, y)`` samples into one batch.

    Args:
        x: Iterable of 3-tuples. The first items are handed to ``dgl.batch``
            (presumably DGL graphs -- confirm against the dataset class), the
            second items are stacked with ``np.array`` and the third items are
            converted directly with ``torch.tensor``.

    Returns:
        tuple: ``(batched_graph, p_tensor, y_tensor)``.
    """
    # Transpose the list of samples into three parallel tuples.
    graphs, proteins, labels = zip(*x)
    batched_graph = dgl.batch(graphs)
    # Stack through NumPy first so torch receives one contiguous array.
    protein_tensor = torch.tensor(np.array(proteins))
    label_tensor = torch.tensor(labels)
    return batched_graph, protein_tensor, label_tensor
def mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Surrounding whitespace and any trailing backslashes are removed from
    ``path`` before it is used.

    Args:
        path (str): Directory to create.

    Raises:
        OSError: Propagated from ``os.makedirs`` when the directory cannot be
            created (for example, an empty path or missing permissions).
    """
    path = path.strip()
    path = path.rstrip("\\")
    # The existence check is kept so that a path which already exists as a
    # regular file is silently left alone, exactly as before.
    if not os.path.exists(path):
        # exist_ok=True closes the check-then-create race: if another process
        # creates the directory between the check above and this call, we no
        # longer fail with FileExistsError.
        os.makedirs(path, exist_ok=True)
def integer_label_protein(sequence, max_length=1200):
    """
    Encode a protein sequence string as a fixed-length array of integer codes.

    Each character is upper-cased and looked up in ``CHARPROTSET``. Characters
    past ``max_length`` are dropped; positions that are not filled (short
    sequences, or characters missing from the table) stay 0.

    Args:
        sequence (str): Protein string sequence.
        max_length: Maximum encoding length of input protein string.

    Returns:
        numpy.ndarray: 1-D array of length ``max_length`` as produced by
        ``np.zeros`` (floating point), holding the per-position codes.
    """
    encoded = np.zeros(max_length)
    for position, residue in enumerate(sequence[:max_length]):
        residue = residue.upper()
        code = CHARPROTSET.get(residue)
        if code is None:
            # Unknown character: leave the slot at 0 and report it.
            logging.warning(
                f"character {residue} does not exists in sequence category encoding, skip and treat as padding."
            )
            continue
        encoded[position] = code
    return encoded