-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsimulation_5cv.py
112 lines (93 loc) · 3.42 KB
/
simulation_5cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
=================================================================
Simulation Study Comparing Random Boosting with Gradient Boosting
=================================================================
Author: Tobias Krabel
Compares Friedman (2001)'s standard Gradient Boosting framework with
Random Tree Depth Injection. In this simulation, both models are tuned
on the same grid using 5-fold CV.
"""
from random_boost.random_boost import RandomBoostingRegressor, RandomBoostingClassifier
from random_boost.utils import gen_friedman_data
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import time
import datetime
# Homegrown
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
def mae(y_true, y_pred):
return mean_absolute_error(y_true, y_pred)
# Constants
# General
SEED = 10101010
N_SAMPLES = 20000
N_INPUTS = 10
N_COMPONENTS = 20
SIGNAL_TO_NOISE = 1.0
N_NOISE = 5
N_ROUNDS = 100
PARAMS = {
'learning_rate': [0.1],
'max_depth': [_ for _ in range(2, 9)],
'n_estimators': [_ for _ in range(100, 200, 5)]
}
# Set seed
np.random.seed(SEED)
# Results
df_result = pd.DataFrame(columns=['run', 'model', 'rmse', 'mae', 'time_sec'])
START = time.time()
for i in range(N_ROUNDS):
print(f'Round #{i+1} of {N_ROUNDS}')
# Data
print('... generate Friedman data')
X, y = gen_friedman_data(n_samples=N_SAMPLES,
n_inputs=N_INPUTS,
n_components=N_COMPONENTS,
n_noise=N_NOISE,
stn=SIGNAL_TO_NOISE)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Fit Random Boost
print(f'... fit Random Boost')
start_time = time.time()
# Little trick: inheritance doesn't seem to work properly
model = RandomBoostingRegressor()
rb = GridSearchCV(model, param_grid=PARAMS, cv=5, n_jobs=7)
rb = rb.fit(X_train, y_train)
time_rb = time.time() - start_time
print(f'... took {time_rb} seconds')
# Fit MART
print(f'... fit MART')
start_time = time.time()
model = GradientBoostingRegressor()
gb = GridSearchCV(model, param_grid=PARAMS, cv=5, n_jobs=7)
gb = gb.fit(X_train, y_train)
time_gb = time.time() - start_time
print(f'... took {time_gb} seconds\n')
# Add Results to Container
models = [rb, gb]
df_result = pd.concat([
df_result,
pd.DataFrame(
data={
'run':([i+1] * 2),
'model':['rb', 'xgb'],
'rmse':[rmse(y_test, model.predict(X_test)) for model in models],
'mae':[mae(y_test, model.predict(X_test)) for model in models],
'time_sec':[time_rb, time_gb]
})],
axis=0,
ignore_index=True
)
# Save to file
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
str_lr = ",".join([str(_) for _ in PARAMS["learning_rate"]])
str_md = ",".join([str(_) for _ in PARAMS["max_depth"]])
str_ne = ",".join([str(_) for _ in PARAMS["n_estimators"]])
df_result.to_csv(f'data/simulation_results/rb-vs-xgb-lr{str_lr}-d{str_md}-nest{str_ne}-5cv-{now}.csv',
index=False)
SECS = time.time() - START
print(f'Simulation took {SECS / N_ROUNDS} seconds per round ({SECS} in total)')