import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


def get_rr_intervals(r_peak_vector):
    """Return the RR intervals: successive differences between R-peak positions."""
    # np.diff computes first differences without mutating r_peak_vector
    return np.diff(r_peak_vector)


def data_preparation(data_df):
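    """Split patients into disjoint train/test groups and build fixed-length
    feature matrices with one row per (channel, window, patient) record.

    The per-algorithm block length is mean + std of the training records'
    'rr_intervals_vec_length'. Returns X_train, X_test, y_train, y_test.
    """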
patients = data_df['patient'].unique()
patients_train, patients_test = train_test_split(patients, test_size=0.3) # , random_state = 42
algorithms = data_df['algorithm'].unique()
channels = data_df['channel'].unique()
    # each record is split into 30*3 slices (each slice is 20 s of data, i.e. one third of a minute)
    num_windows = data_df['window_num'].max()
    # determine the per-algorithm feature-vector length
indices_of_patients_train_in_df = data_df[data_df['patient'].isin(patients_train)].index
mean_value = data_df.loc[indices_of_patients_train_in_df, 'rr_intervals_vec_length'].mean()
std_value = data_df.loc[indices_of_patients_train_in_df, 'rr_intervals_vec_length'].std()
vec_length = int(mean_value) + int(std_value)
    # allocate zero-filled feature matrices and label vectors
num_rows_train = len(channels) * num_windows * len(patients_train)
num_rows_test = len(channels) * num_windows * len(patients_test)
num_columns = vec_length * len(algorithms)
X_train, X_test, y_train, y_test = (np.zeros((num_rows_train, num_columns)), np.zeros((num_rows_test, num_columns)),
np.zeros(num_rows_train), np.zeros(num_rows_test))
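    # row cursors into the train and test matrices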
tr_i = 0
te_i = 0
for channel in channels:
for window in range(num_windows):
# Train set
for patient in patients_train:
X_train[tr_i, :], y_train[tr_i] = (
get_one_record_based_on_sorted_rr_intervals(data_df, vec_length, channel, window, patient))
tr_i += 1
# Test set
for patient in patients_test:
X_test[te_i, :], y_test[te_i] = (
get_one_record_based_on_sorted_rr_intervals(data_df, vec_length, channel, window, patient))
te_i += 1
return X_train, X_test, y_train, y_test


def get_one_record_based_on_sorted_rr_intervals(data_df, vec_length, channel, window, patient):
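    """Build one feature row for a (channel, window, patient) record.

    Concatenates one block of vec_length values per algorithm, holding that
    algorithm's RR intervals sorted in descending order and truncated or
    zero-padded to vec_length. Returns (x_vector, noise_label).
    """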
    # build one row: one sorted, fixed-length RR-interval block per algorithm,
    # ordered by sorted DataFrame index so the algorithm order is consistent
indices = data_df.query(f'channel == {channel} & window_num == {window} & patient == {patient}').index
indices = np.sort(indices)
num_algorithms = len(indices)
x_vector = np.zeros(vec_length * num_algorithms)
noise_label = data_df.loc[indices[0], 'noise_label']
    for index_i, index in enumerate(indices):
        try:
            # sort this algorithm's RR intervals in descending order
            sorted_rr_intervals = np.sort(data_df.loc[index, 'rr_intervals'])[::-1]
        except np.exceptions.AxisError:
            # np.sort raises AxisError when the cell holds a 0-d scalar
            # (e.g. NaN for a failed detection) instead of an array
            sorted_rr_intervals = np.array([])
        sorted_rr_intervals_length = len(sorted_rr_intervals)
if vec_length <= sorted_rr_intervals_length:
temp = sorted_rr_intervals[:vec_length]
else:
temp = np.zeros(vec_length)
if sorted_rr_intervals_length > 0:
temp[:sorted_rr_intervals_length] = sorted_rr_intervals
start_i = index_i * vec_length
end_i = (index_i + 1) * vec_length
x_vector[start_i:end_i] = temp
return x_vector, noise_label


def build_ml_model(X_train, y_train):
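    """Train a single RandomForest classifier and return the fitted model."""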
    # a single random-forest baseline; build_several_ml_model trains the alternatives
    clf_rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
return clf_rf


def build_several_ml_model(X_train, y_train):
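    """Train a set of classifiers on the same training data and return them all."""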
    clf_ert = ExtraTreesClassifier(n_estimators=100).fit(X_train, y_train)
    clf_rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
    clf_gb = GradientBoostingClassifier(n_estimators=100).fit(X_train, y_train)
    clf_xgb = XGBClassifier(n_jobs=1).fit(X_train, y_train)
clf_dt = DecisionTreeClassifier().fit(X_train, y_train)
clf_ls = make_pipeline(StandardScaler(), LinearSVC()).fit(X_train, y_train)
clf_rbfsvm = make_pipeline(StandardScaler(), SVC()).fit(X_train, y_train)
    return clf_ert, clf_rf, clf_gb, clf_xgb, clf_dt, clf_ls, clf_rbfsvm


def evaluate(model, X_test, y_test, print_flag=False):
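    """Evaluate a fitted binary classifier; returns (f1, accuracy, precision, recall)."""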
# predict
y_pred = model.predict(X_test)
y_true = y_test
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
f1_performance = f1_score(y_true, y_pred) # , average='weighted'
acc_performance = accuracy_score(y_true, y_pred)
precision_performance = precision_score(y_true, y_pred)
recall_performance = recall_score(y_true, y_pred)
if print_flag:
print("tn, fp, fn, tp: ", tn, fp, fn, tp)
print("f1_performance: ", f1_performance)
print("acc_performance", acc_performance)
print("precision_performance", precision_performance)
print("recall_performance", recall_performance)
print("\nclassification_report")
print(classification_report(y_true, y_pred, target_names=['non noise', 'noise']))
return f1_performance, acc_performance, precision_performance, recall_performance
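

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original pipeline. Assumes the
    # prepared records live in a pickled DataFrame at 'rr_data.pkl' (path and
    # format are placeholders) with the columns this module queries:
    # 'patient', 'algorithm', 'channel', 'window_num', 'rr_intervals',
    # 'rr_intervals_vec_length', 'noise_label'.
    import pandas as pd

    data_df = pd.read_pickle('rr_data.pkl')
    X_train, X_test, y_train, y_test = data_preparation(data_df)
    model = build_ml_model(X_train, y_train)
    evaluate(model, X_test, y_test, print_flag=True)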