"""
================================================
Clustering in 2D and 3D with/without PCA applied
================================================
"""
import sys
import time
from contextlib import contextmanager

import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from filehandler import Filehandler
from dataset import KDDCup1999
from visualize import Visualize


@contextmanager
def timer(title):
    """Context manager that reports the wall-clock time taken by its block."""
    t0 = time.time()
    yield
    print('{} - done in {:.0f}s'.format(title, time.time() - t0))
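
# Example: `with timer('KMeans fit'):` prints "KMeans fit - done in <N>s"
# once the wrapped block completes.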


class Clustering:
    def __init__(self):
        self.logfile = None
        self.gettrace = getattr(sys, 'gettrace', None)
        self.original_stdout = sys.stdout
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        self.log_file()
        print(__doc__)

        self.filehandler = Filehandler()
        self.ds = KDDCup1999()
        self.visualize = Visualize()
        self.random_state = 20
        self.clusters_stop = 11  # cluster counts 2..10 are evaluated
        self.x = None
        self.y = None
        self.full = None
        self.ac_count = {}

        # Column-index maps handed to the visualizer: raw feature positions,
        # or flags asking it to project onto PCA / Kernel PCA components
        self.feature_idx = {0: 0, 1: 0, 2: 0}
        self.pca_idx = {0: 0, 1: 1, 2: 2, 'pca': True}
        self.kernelpca_idx = {0: 0, 1: 1, 2: 2, 'kpca': True}

        self.scale_cols = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                           'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                           'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login',
                           'count', 'srv_count', 'serror_rate', 'rerror_rate', 'diff_srv_rate', 'srv_diff_host_rate',
                           'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate',
                           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate']

        # Feature triples plotted against each other in 2D/3D
        self.cluster_cols = [('count', 'diff_srv_rate', 'src_bytes'),
                             ('src_bytes', 'dst_host_srv_count', 'dst_bytes'),
                             ('srv_diff_host_rate', 'srv_count', 'serror_rate'),
                             ('serror_rate', 'dst_host_diff_srv_rate', 'flag')]

        with timer('\nLoading dataset'):
            self.load_data()
            self.ds.shape()

        with timer('\nEncode and scale dataset'):
            self.encode_scale()

        with timer('\nSetting X and y'):
            self.set_x_y()

        with timer('\nPlotting clusters for specific columns'):
            for cola, colb, colc in self.cluster_cols:
                self.set_indexes(cola, colb, colc)  # indexes depend only on the columns, not on c
                for c in range(2, self.clusters_stop):
                    with timer('\n2D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c)
                    with timer('\n3D clustering without PCA'):
                        self.cluster(idx=self.feature_idx, n_clusters=c, projection='3d')

        with timer('\nPlotting clusters applying PCA'):
            for c in range(2, self.clusters_stop):
                with timer('\n2D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c)
                with timer('\n3D clustering with PCA'):
                    self.cluster(idx=self.pca_idx, n_clusters=c, projection='3d')

        # Kernel PCA runs are commented out: they raised a memory error on the full dataset
        # with timer('\nPlotting clusters applying Kernel PCA'):
        #     for c in range(2, 7):
        #         with timer('\n2D clustering with Kernel PCA'):
        #             self.cluster(idx=self.kernelpca_idx, n_clusters=c)
        #         with timer('\n3D clustering with Kernel PCA'):
        #             self.cluster(idx=self.kernelpca_idx, n_clusters=c, projection='3d')

        self.log_file()  # second call restores stdout and closes the log
        print('Finished')

    def log_file(self):
        """Toggle stdout redirection to a timestamped log file.

        Redirection is skipped entirely when a debugger/trace hook is
        active, so output stays on the console while debugging.
        """
        if self.gettrace is None or self.gettrace():
            return
        if self.logfile:
            # Second call: restore stdout and close the log
            sys.stdout = self.original_stdout
            self.logfile.close()
            self.logfile = False
        else:
            # First call: redirect stdout to file for logging if not in debug mode
            self.logfile = open('logs/{}_{}_stdout.txt'.format(self.__class__.__name__, self.timestr), 'w')
            sys.stdout = self.logfile

    def load_data(self):
        # Expects '<file>_processed' and '<file>_target' CSVs produced upstream
        self.ds.dataset = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_processed')
        self.ds.target = self.filehandler.read_csv(self.ds.config['path'], self.ds.config['file'] + '_target')
        self.full = pd.concat([self.ds.dataset, self.ds.target], axis=1)

    def encode_scale(self):
        # Encode categoricals
        le = preprocessing.LabelEncoder()
        self.full['protocol_type'] = le.fit_transform(self.full['protocol_type'])
        self.full['service'] = le.fit_transform(self.full['service'])
        self.full['flag'] = le.fit_transform(self.full['flag'])

        # Scale
        sc = StandardScaler()
        self.full[self.scale_cols] = sc.fit_transform(self.full[self.scale_cols])

    def set_x_y(self):
        # The final two columns of `full` are the appended target columns
        self.x = self.full.iloc[:, :-2]
        self.y = self.full['target']

    def set_indexes(self, cola, colb, colc):
        # Record the positional indexes of the three features to plot
        self.feature_idx[0] = self.x.columns.get_loc(cola)
        self.feature_idx[1] = self.x.columns.get_loc(colb)
        self.feature_idx[2] = self.x.columns.get_loc(colc)

    def cluster(self, idx, n_clusters, projection=None):
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state)
        # fit_predict fits the model and returns cluster labels in one pass
        y_km = kmeans.fit_predict(self.x)
        self.visualize.scatter_clusters(self.x, n_clusters, y_km, idx, projection)
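

# A minimal standalone sketch (not part of the pipeline above) of the
# PCA + KMeans combination that `pca_idx` asks Visualize to apply; it assumes
# only scikit-learn and illustrates the technique, not this project's
# Visualize internals. `pca_kmeans_sketch` is a hypothetical helper name.
def pca_kmeans_sketch(x, n_clusters, n_components=3, random_state=20):
    from sklearn.decomposition import PCA  # local import keeps the sketch self-contained
    # Project the scaled features onto the leading principal components,
    # then cluster in the reduced space
    components = PCA(n_components=n_components).fit_transform(x)
    labels = KMeans(n_clusters=n_clusters, random_state=random_state).fit_predict(components)
    return components, labels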


if __name__ == '__main__':
    clustering = Clustering()