# sclerosis_multiple.py
import os
import torch
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from torch import nn, optim
from torch.utils.data import DataLoader
from common import utils, fit_scaler, results_to_csv, DataBuilder
import numpy as np
import pandas as pd
from common.metrics import calculate_eer
from discrimination.discrimination_utils import load_data, roc_auc_score_multiclass, check_model_used, \
    load_joint_embs, load_baseline_feats
from discrimination.svm_utils import train_svm
from common.dimension_reduction import ReduceDims, Autoencoder, train, weights_init_uniform_rule, \
    VariationalAutoencoder, CustomLoss, train_vae
config = utils.load_config('config/config_depression.yml') # loading configuration
# config_bea = utils.load_config('config/config_bea16k.yml') # loading configuration for bea dataset (PCA, std)
shuffle_data = config['shuffle_data'] # whether to shuffle the training data
label_file = config['paths']['to_labels'] # path to the labels of the dataset
output_results = config['paths']['output_results'] # path to csv for saving the results
emb_type = config['discrimination']['emb_type']
checkpoint_path = config['pretrained_model_details']['checkpoint_path']
model_used = check_model_used(checkpoint_path)
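# For reference, an illustrative layout of config/config_depression.yml, inferred from the keys
# this script reads below (section and key names are the ones used here; all values are placeholders):
#   shuffle_data: true
#   feature_combination: false
#   paths:
#     to_labels: data/labels.csv
#     output_results: results/results.csv
#   discrimination:
#     emb_type: hiddens
#   pretrained_model_details:
#     checkpoint_path: checkpoints/model.pt
#   data_scaling:
#     scaler_type: standard
#   dimension_reduction:
#     method: PCA          # PCA | autoencoder | vae | None
#     pca:
#       n_components: 0.95
#     autoencoder:
#       num_epochs: 100
#       encoder_size: 32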
# data = load_baseline_feats(path='data/10_narrative_recall/x-vectors/xvecs-10_narrative_recall-sre16-mfcc23-aug.txt', delimiter=None)
# data = load_baseline_feats(path='data/10_narrative_recall/compare/features.compare2016.rr45.10_narrative_recall.txt',
# delimiter=',')
if config['feature_combination']:
    data = load_joint_embs(config=config)
else:
    data = load_data(config=config)  # loading data
# bea_train_flat = load_data(config=config_bea)  # load bea embeddings
df_labels = pd.read_csv(label_file) # loading labels
data['label'] = df_labels.label.values # adding labels to data
# Shuffling data if needed
if shuffle_data:
    data = data.sample(frac=1).reset_index(drop=True)
x_train, y_train = data.iloc[:, :-1].values, data.label.values # train and labels
# Standardizing data before reducing dimensions
# scaler_type = config_bea['data_scaling']['scaler_type']
scaler_type = config['data_scaling']['scaler_type']
if scaler_type != 'None':
    scaler = fit_scaler(config, x_train)
    # scaler = fit_scaler(config_bea, bea_train_flat)
    # bea_train_flat = scaler.transform(bea_train_flat)
    x_train = scaler.transform(x_train)
    print("Train data standardized...")
# Only modify this in the config file, not here!
# dim_reduction = config_bea['dimension_reduction']['method'] # autoencoder
dim_reduction = config['dimension_reduction']['method'] # autoencoder
size_reduced = 'None' # new dimension size after reduction
n_epochs = 'None'
variance = 'None'
if dim_reduction == 'PCA':
    # APPLY PCA!
    # Train the PCA model (originally fitted on the bea-train-flat embeddings: ~57k files for each
    # emb type, convs and hiddens), then transform the dataset with the fitted model.
    reduce_dims = ReduceDims(config_bea=config)
    # if not scaler_type:
    #     scaler = fit_scaler(config_bea, bea_train_flat)
    #     bea_train_flat = scaler.transform(bea_train_flat)
    # pca = reduce_dims.fit_pca(bea_train_flat)  # train PCA on the bea embeddings
    pca = reduce_dims.fit_pca(x_train)  # train PCA
    x_train = pca.transform(x_train)  # transform (reduce dimensionality)
    print("New shape:", x_train.shape)
    size_reduced = x_train.shape[1]
    variance = config['dimension_reduction']['pca']['n_components']
elif dim_reduction == 'autoencoder':
    print("\nReducing dimensions using a basic Autoencoder. Initial shape: {}".format(x_train.shape))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    n_epochs = config['dimension_reduction']['autoencoder']['num_epochs']
    enc_shape = config['dimension_reduction']['autoencoder']['encoder_size']
    # bea_train_flat = torch.from_numpy(bea_train_flat).double().to(device)
    x_train = torch.from_numpy(x_train).double().to(device)
    # defining the autoencoder and training it
    encoder = Autoencoder(in_shape=x_train.shape[1], enc_shape=enc_shape).double().to(device)
    error = nn.MSELoss()
    optimizer = optim.Adam(encoder.parameters())
    train(encoder, error, optimizer, n_epochs, x_train)
    # reducing the dimensions
    with torch.no_grad():
        encoded = encoder.encode(x_train)
        decoded = encoder.decode(encoded)
        mse = error(decoded, x_train).item()  # reconstruction MSE of the trained autoencoder
    x_train = encoded.cpu().detach().numpy()
    dec = decoded.cpu().detach().numpy()  # reconstructions (kept for inspection, not used further)
    size_reduced = x_train.shape[1]
    print("New encoded shape:", x_train.shape)
elif dim_reduction == 'vae':
    print("\nReducing dimensions using {0}. Initial shape: {1}".format(dim_reduction, x_train.shape))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    n_epochs = config['dimension_reduction']['autoencoder']['num_epochs']
    log_interval = 50
    # converting the data into DataLoaders (needed for training); the VAE was originally fitted on
    # the external bea embeddings (bea_train_flat), but with those commented out above it is
    # trained directly on x_train
    data_set = DataBuilder(x_train)
    train_loader = DataLoader(dataset=data_set, batch_size=32)
    train_data = DataBuilder(x_train)
    x_loader = DataLoader(dataset=train_data, batch_size=32)
    # define params
    D_in = data_set.x.shape[1]
    H = 50
    H2 = 12
    latent_dim = config['dimension_reduction']['autoencoder']['encoder_size']  # output size of the reduced embeddings
    model = VariationalAutoencoder(D_in, latent_dim, H, H2).to(device)
    model.apply(weights_init_uniform_rule)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_mse = CustomLoss()
    # train
    print("Training the Variational Autoencoder...")
    for epoch in range(1, n_epochs + 1):
        model_trained = train_vae(model, train_loader, epoch, device, optimizer, loss_mse, config)
    # Reducing the dimensions of x_train: collect the latent means (mu) of every batch
    mu_output = []
    with torch.no_grad():
        for batch in x_loader:
            batch = batch.to(device)
            recon_batch, mu, logvar = model_trained(batch)
            mu_output.append(mu)
    mu_result = torch.cat(mu_output, dim=0)
    size_reduced = mu_result.shape[1]
    print("New encoded shape:", mu_result.shape)
    x_train = mu_result.detach().cpu().numpy()
else:
    pass
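
# For context, a minimal sketch of the Autoencoder interface the 'autoencoder' branch above
# assumes (the real class lives in common.dimension_reduction; the hidden size is illustrative):
class _AutoencoderSketch(nn.Module):
    def __init__(self, in_shape, enc_shape, hidden=128):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_shape, hidden), nn.ReLU(), nn.Linear(hidden, enc_shape))
        self.decoder = nn.Sequential(nn.Linear(enc_shape, hidden), nn.ReLU(), nn.Linear(hidden, in_shape))

    def encode(self, x):
        return self.encoder(x)  # map input to the low-dimensional code

    def decode(self, z):
        return self.decoder(z)  # reconstruct the input from the code

    def forward(self, x):
        return self.decode(self.encode(x))
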
# Train SVM
print("Using", config['discrimination']['emb_type'])
list_c = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1]
df = pd.DataFrame(columns=['c', 'acc', 'f1', 'prec', 'recall', 'auc', 'eer'])
for c in list_c:
    # TODO: try MLP or KNN as well
    array_preds, array_trues, array_probs = train_svm(svm_type='linear-loocv', C=c, X=x_train, y=np.ravel(y_train))
    # array_preds, array_trues, array_probs = train_svm(svm_type='rbf-loocv', C=c, X=x_train, y=y_train)
    acc = accuracy_score(array_trues, array_preds)
    # auc = roc_auc_score(array_trues, array_probs, labels=np.unique(y_train))
    auc = roc_auc_score(array_trues, array_probs[:, 1])  # AUC from the probabilities of the positive class
    # aucs = roc_auc_score_multiclass(actual_class=array_trues, pred_class=array_preds)
    f1 = f1_score(array_trues, array_preds)
    prec = precision_score(array_trues, array_preds)
    rec = recall_score(array_trues, array_preds)
    eer = calculate_eer(array_trues, array_preds)
    dict_metrics = {'c': c, 'acc': acc, 'f1': f1, 'prec': prec, 'recall': rec, 'auc': auc, 'EER': eer,
                    'Embedding': emb_type,
                    'Reduction technique': '{0}-{1}'.format(dim_reduction, str(size_reduced)),
                    'Model used': model_used,
                    'std': str(scaler_type), 'n_epochs': n_epochs, 'variance': variance}
    # DataFrame.append was removed in pandas 2.x; pd.concat works across versions
    df = pd.concat([df, pd.DataFrame([dict_metrics])], ignore_index=True)
    print("with", c, "acc:", acc, " f1:", f1, " prec:", prec, " recall:", rec, 'AUC:', auc, 'EER:', eer)
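
# A minimal sketch of what train_svm with svm_type='linear-loocv' presumably does (the real
# helper lives in discrimination.svm_utils): a linear SVM scored with leave-one-out cross-validation.
def _train_svm_loocv_sketch(C, X, y):
    from sklearn.model_selection import LeaveOneOut  # local imports: illustration only
    from sklearn.svm import SVC
    preds, trues, probs = [], [], []
    for train_idx, test_idx in LeaveOneOut().split(X):
        clf = SVC(kernel='linear', C=C, probability=True)
        clf.fit(X[train_idx], y[train_idx])
        preds.append(clf.predict(X[test_idx])[0])
        trues.append(y[test_idx][0])
        probs.append(clf.predict_proba(X[test_idx])[0])  # shape (2,): class probabilities
    return np.array(preds), np.array(trues), np.array(probs)
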
# Saving results
best_scores_df = df.iloc[[df['auc'].idxmax()]] # getting the best scores based on the highest AUC score.
# best_scores_df.to_csv(output_results, mode='a', header=not os.path.exists(output_results), index=False)
print(best_scores_df.values)
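
# A minimal sketch of how calculate_eer could be implemented (the real version lives in
# common.metrics); the equal error rate is where the false-positive and false-negative rates cross:
def _calculate_eer_sketch(y_true, y_score):
    from sklearn.metrics import roc_curve  # local import: illustration only
    fpr, tpr, _ = roc_curve(y_true, y_score)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fpr - fnr))  # threshold index where FPR and FNR are closest
    return (fpr[idx] + fnr[idx]) / 2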