-
Notifications
You must be signed in to change notification settings - Fork 19
/
demo_opt_gtmr_with_cv_multi_y_descriptors.py
134 lines (118 loc) · 6.49 KB
/
demo_opt_gtmr_with_cv_multi_y_descriptors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
# %reset -f
"""
@author: Hiromasa Kaneko
"""
# Demonstration of GTMR (Generative Topographic Mapping Regression) with multiple y-variables
import matplotlib.figure as figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dcekit.generative_model import GTM
from sklearn.model_selection import train_test_split
# settings
# Descriptor columns to drop before modeling; set to None or [] to keep all columns.
deleting_descriptor_names = ['Ipc', 'Kappa3']
# deleting_descriptor_names = []
fold_number = 2  # number of folds in cross-validation
# Hyperparameter candidate grids for the GTM cross-validated optimization:
candidates_of_shape_of_map = np.arange(30, 31, dtype=int)  # map size k (k x k grid); single candidate here
candidates_of_shape_of_rbf_centers = np.arange(2, 22, 2, dtype=int)  # q (q x q RBF centers)
candidates_of_variance_of_rbfs = 2 ** np.arange(-5, 4, 2, dtype=float)  # RBF variance candidates
candidates_of_lambda_in_em_algorithm = 2 ** np.arange(-4, 0, dtype=float)  # EM regularization lambda
# candidates_of_shape_of_rbf_centers = np.arange(2, 18, 2, dtype=int)
# candidates_of_variance_of_rbfs = 2 ** np.arange(-7, 2, 2, dtype=float)
# candidates_of_lambda_in_em_algorithm = 2 ** np.arange(-4, -1, dtype=float)
# Prepend 0 so that "no regularization" is also tried as a lambda candidate.
candidates_of_lambda_in_em_algorithm = np.append(0, candidates_of_lambda_in_em_algorithm)
number_of_iterations = 200  # EM iterations per GTM fit
display_flag = True  # whether the model prints/plots progress
# NOTE(review): random_state_number is defined but never used below
# (train_test_split uses a hard-coded random_state=100) — confirm intent.
random_state_number = 30000
number_of_test_samples = 600  # held-out test-set size for train_test_split
numbers_of_y = [0, 1, 2]  # column indices of the y-variables (leading columns of the CSV)
# load dataset
# Expects the y-variables in the leading columns (numbers_of_y) followed by descriptors.
variables = pd.read_csv('descriptors_with_multi_y.csv', index_col=0)
if deleting_descriptor_names is not None:
    variables = variables.drop(deleting_descriptor_names, axis=1)
variables_train, variables_test = train_test_split(variables, test_size=number_of_test_samples, random_state=100)
# variables_train, variables_test = train_test_split(variables, test_size=number_of_test_samples, shuffle=False)
# Replace +inf AND -inf with NaN. (Fix: the original replaced only np.inf, so
# -inf values would survive and corrupt the mean/std statistics below; the
# trailing .fillna(np.nan) was a no-op and has been removed.)
variables_train = variables_train.replace([np.inf, -np.inf], np.nan)
nan_variable_flags = variables_train.isnull().any()  # variables that contain NaN
# Drop NaN-containing variables from both sets (flags derived from training data only,
# so train and test keep identical columns).
variables_train = variables_train.drop(variables_train.columns[nan_variable_flags], axis=1)
variables_test = variables_test.drop(variables_test.columns[nan_variable_flags], axis=1)
# Drop variables with zero standard deviation (constant columns cannot be autoscaled)
std_0_variable_flags = variables_train.std() == 0
variables_train = variables_train.drop(variables_train.columns[std_0_variable_flags], axis=1)
variables_test = variables_test.drop(variables_test.columns[std_0_variable_flags], axis=1)
# x-variable column indices: everything after the y columns. (An earlier dead
# computation of numbers_of_x that was unconditionally overwritten here has been removed.)
numbers_of_x = np.arange(numbers_of_y[-1] + 1, variables_train.shape[1])
# standardize x and y using training-set statistics only (no test-set leakage)
autoscaled_variables_train = (variables_train - variables_train.mean(axis=0)) / variables_train.std(axis=0, ddof=1)
autoscaled_variables_test = (variables_test - variables_train.mean(axis=0)) / variables_train.std(axis=0, ddof=1)
# optimize hyperparameter in GTMR with CV
model = GTM()
# Grid-search every combination of the hyperparameter candidates with
# fold_number-fold cross-validation; the selected values are stored on the
# model instance (shape_of_map, shape_of_rbf_centers, variance_of_rbfs,
# lambda_in_em_algorithm).
model.cv_opt(autoscaled_variables_train, numbers_of_x, numbers_of_y, candidates_of_shape_of_map,
candidates_of_shape_of_rbf_centers,
candidates_of_variance_of_rbfs, candidates_of_lambda_in_em_algorithm, fold_number,
number_of_iterations)
# NOTE(review): display_flag is assigned only AFTER cv_opt, so it affects the
# final fit below but not the CV search itself — confirm this is intended.
model.display_flag = display_flag
print('optimized shape of map :', model.shape_of_map)
print('optimized shape of RBF centers :', model.shape_of_rbf_centers)
print('optimized variance of RBFs :', model.variance_of_rbfs)
print('optimized lambda in EM algorithm :', model.lambda_in_em_algorithm)
# construct GTMR model with the optimized hyperparameters
model.fit(autoscaled_variables_train)
# Everything below runs only if the EM algorithm converged successfully.
# (Fix: the control-flow indentation of this if-body and both for-loops was
# lost in this copy of the file; reconstructed here.)
if model.success_flag:
    # calculate responsibilities of the training samples on the 2-D latent map
    responsibilities = model.responsibility(autoscaled_variables_train)
    means, modes = model.means_modes(autoscaled_variables_train)

    plt.rcParams['font.size'] = 18
    for y_number in numbers_of_y:
        # plot the mean of responsibilities, colored by the actual y values
        plt.scatter(means[:, 0], means[:, 1], c=variables_train.iloc[:, y_number])
        plt.colorbar()
        plt.ylim(-1.1, 1.1)
        plt.xlim(-1.1, 1.1)
        plt.xlabel('z1 (mean)')
        plt.ylabel('z2 (mean)')
        plt.show()
        # plot the mode of responsibilities
        plt.scatter(modes[:, 0], modes[:, 1], c=variables_train.iloc[:, y_number])
        plt.colorbar()
        plt.ylim(-1.1, 1.1)
        plt.xlim(-1.1, 1.1)
        plt.xlabel('z1 (mode)')
        plt.ylabel('z2 (mode)')
        plt.show()

    # GTMR prediction
    # Forward analysis (regression): estimate y from x
    mean_of_estimated_mean_of_y, mode_of_estimated_mean_of_y, responsibilities_y, py = \
        model.predict(autoscaled_variables_test.iloc[:, numbers_of_x], numbers_of_x, numbers_of_y)
    # Inverse analysis: estimate x from y
    # NOTE(review): these inverse-analysis results are never used below, and
    # they overwrite responsibilities_y / py from the forward analysis — confirm
    # whether the inverse call is meant only as a usage demonstration.
    mean_of_estimated_mean_of_x, mode_of_estimated_mean_of_x, responsibilities_y, py = \
        model.predict(autoscaled_variables_test.iloc[:, numbers_of_y], numbers_of_y, numbers_of_x)

    # Check results of forward analysis (regression)
    print('Results of forward analysis (regression)')
    predicted_y_test_all = mode_of_estimated_mean_of_y.copy()
    plt.rcParams['font.size'] = 18
    for index, y_number in enumerate(numbers_of_y):
        # rescale the autoscaled prediction back to the original y units
        predicted_y_test = np.ndarray.flatten(predicted_y_test_all[:, index])
        predicted_y_test = predicted_y_test * variables_train.iloc[:, y_number].std(ddof=1) + variables_train.iloc[:, y_number].mean()
        # yy-plot (actual vs. estimated) with a diagonal reference line and 5% margins
        plt.figure(figsize=figure.figaspect(1))
        plt.scatter(variables_test.iloc[:, y_number], predicted_y_test)
        YMax = np.max(np.array([np.array(variables_test.iloc[:, y_number]), predicted_y_test]))
        YMin = np.min(np.array([np.array(variables_test.iloc[:, y_number]), predicted_y_test]))
        plt.plot([YMin - 0.05 * (YMax - YMin), YMax + 0.05 * (YMax - YMin)],
                 [YMin - 0.05 * (YMax - YMin), YMax + 0.05 * (YMax - YMin)], 'k-')
        plt.ylim(YMin - 0.05 * (YMax - YMin), YMax + 0.05 * (YMax - YMin))
        plt.xlim(YMin - 0.05 * (YMax - YMin), YMax + 0.05 * (YMax - YMin))
        plt.xlabel('Actual Y')
        plt.ylabel('Estimated Y')
        plt.show()
        # test-set metrics: r2p, RMSEp, MAEp
        print(
            'r2p: {0}'.format(float(1 - sum((variables_test.iloc[:, y_number] - predicted_y_test) ** 2) / sum(
                (variables_test.iloc[:, y_number] - variables_test.iloc[:, y_number].mean()) ** 2))))
        print('RMSEp: {0}'.format(float((sum((variables_test.iloc[:, y_number] - predicted_y_test) ** 2) / len(
            variables_test.iloc[:, y_number])) ** 0.5)))
        print('MAEp: {0}'.format(float(sum(abs(variables_test.iloc[:, y_number] - predicted_y_test)) / len(
            variables_test.iloc[:, y_number]))))