forked from h1st-ai/h1st
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgradient_boosting_msg_classifier.py
67 lines (60 loc) · 2.78 KB
/
gradient_boosting_msg_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import h1st as h1
import pandas as pd
import config
import util
# Columns fed to the classifier: every name in config.SENSORS, followed by a
# "<sensor>_TimeDiff" companion name for each of them.
# NOTE(review): the *_TimeDiff columns are presumably created by
# util.compute_timediff_fillna -- confirm against util.
FEATURES = config.SENSORS + [
    "%s_TimeDiff" % sensor for sensor in config.SENSORS
]
class GradientBoostingMsgClassifierModel(h1.Model):
    """Per-message attack classifier built on HistGradientBoostingClassifier.

    The model is trained on the columns listed in ``FEATURES`` with a boolean
    target ``Label == config.ATTACK_LABEL``.  At prediction time it only
    scores messages that fall inside windows already flagged by an upstream
    event detector.
    """

    def load_data(self, num_files=None):
        """Return ``util.load_data(num_files, shuffle=False)``."""
        return util.load_data(num_files, shuffle=False)

    def prep(self, data):
        """Split the attack files in half and load each half into a DataFrame.

        Args:
            data: mapping with an ``"attack_files"`` sequence; each entry is
                handed to ``pd.read_parquet``.

        Returns:
            dict with keys ``"train_files"``, ``"test_files"``,
            ``"train_attack_df"`` and ``"test_attack_df"``.  The first half of
            the file list (rounded down) is the training set, the remainder
            is the test set.
        """

        def _load_and_concat(paths):
            # Read every parquet file, pass it through
            # util.compute_timediff_fillna (with dropna_subset=FEATURES), and
            # stack the resulting frames in file order.
            frames = [
                util.compute_timediff_fillna(
                    pd.read_parquet(path), dropna_subset=FEATURES
                )
                for path in paths
            ]
            return pd.concat(frames)

        attack_files = data["attack_files"]
        half = len(attack_files) // 2
        train_files, test_files = attack_files[:half], attack_files[half:]

        result = {"train_files": train_files, "test_files": test_files}
        result["train_attack_df"] = _load_and_concat(train_files)
        result["test_attack_df"] = _load_and_concat(test_files)

        for key in ("train_attack_df", "test_attack_df"):
            print("len %s = %s" % (key, len(result[key])))
        return result

    def train(self, prepared_data):
        """Fit the classifier on ``prepared_data["train_attack_df"]``.

        The fitted estimator is stored on ``self.model``.
        """
        train_df = prepared_data["train_attack_df"]

        # The experimental flag has to be imported before the estimator on
        # scikit-learn versions where the estimator is still experimental.
        from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
        from sklearn.ensemble import HistGradientBoostingClassifier

        features = train_df[FEATURES]
        is_attack = train_df.Label == config.ATTACK_LABEL

        classifier = HistGradientBoostingClassifier(max_iter=500)
        classifier.fit(features, is_attack)
        self.model = classifier

    def evaluate(self, prepared_data):
        """Score ``self.model`` on ``prepared_data["test_attack_df"]``.

        Prints the confusion matrix and accuracy, and stores both in
        ``self.metrics`` under ``"confusion_matrix"`` and ``"accuracy"``.
        """
        test_df = prepared_data["test_attack_df"]
        predicted = self.model.predict(test_df[FEATURES])

        from sklearn import metrics as sk_metrics

        # Same boolean target definition as used in train().
        actual = test_df.Label == config.ATTACK_LABEL
        cf = sk_metrics.confusion_matrix(actual, predicted)
        acc = sk_metrics.accuracy_score(actual, predicted)

        print(cf)
        print("Accuracy = %.4f" % acc)
        self.metrics = {"confusion_matrix": cf, "accuracy": acc}

    def predict(self, data):
        """Label individual messages inside windows flagged as under attack.

        Args:
            data: mapping with
                ``"df"`` -- DataFrame of messages; must have a ``Timestamp``
                column (a copy is taken, so the input is not modified here);
                ``"event_detection_results"`` -- iterable of mappings carrying
                ``'WindowInAttack'`` and ``'window_start'``.

        Returns:
            ``{"injection_window_results": df}`` where ``df`` has two added
            integer columns: ``WindowInAttack`` (1 for rows inside a flagged
            window) and ``MsgIsAttack`` (classifier output for those rows,
            0 elsewhere).
        """
        df = util.compute_timediff_fillna(data["df"].copy())
        df['MsgIsAttack'] = 0
        df['WindowInAttack'] = 0

        for event_result in data["event_detection_results"]:
            if not event_result['WindowInAttack']:
                continue

            # Window is half-open: [window_start, window_start + WINDOW_SIZE).
            window_start = event_result['window_start']
            window_end = window_start + config.WINDOW_SIZE
            in_window = (df.Timestamp >= window_start) & (df.Timestamp < window_end)

            window_df = df[in_window]
            if window_df.empty:
                # No messages fall in this window, so there is nothing to score.
                continue

            window_pred = self.model.predict(window_df[FEATURES])
            df.loc[in_window, "WindowInAttack"] = 1
            df.loc[in_window, "MsgIsAttack"] = window_pred.astype(int)

        return {"injection_window_results": df}