'''
Author - Nandadeep Davuluru
Email - [email protected]
Course - CS 545 Machine learning
Instructor - Anthony Rhodes
Task - Naive Bayes and logistic regression
'''
import itertools
import pdb
import numpy
import random
import pandas as pd
import matplotlib.pyplot as plotter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# a roughly 60/40 train/test split of each class gives the following counts:
# spam: 1813 rows total -> about 1088 train / 725 test
# not spam: 2788 rows total -> about 1672 train / 1116 test
# printing the entire data frame
pd.set_option('expand_frame_repr', False)
#### Confusion Matrix helper functions - sklearn metrics
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=None, normalize=True):
"""
given a sklearn confusion matrix (cm), make a nice plot
Arguments
---------
cm: confusion matrix from sklearn.metrics.confusion_matrix
target_names: given classification classes such as [0, 1, 2]
the class names, for example: ['high', 'medium', 'low']
title: the text to display at the top of the matrix
cmap: the gradient of the values displayed from matplotlib.pyplot.cm
see http://matplotlib.org/examples/color/colormaps_reference.html
plotter.get_cmap('jet') or plotter.cm.Blues
normalize: If False, plot the raw numbers
If True, plot the proportions
Usage
-----
plot_confusion_matrix(cm = cm, # confusion matrix created by
# sklearn.metrics.confusion_matrix
normalize = True, # show proportions
target_names = y_labels_vals, # list of names of the classes
title = best_estimator_name) # title of graph
Citiation
---------
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
https://www.kaggle.com/grfiv4/plot-a-confusion-matrix
"""
accuracy = numpy.trace(cm) / float(numpy.sum(cm))
misclass = 1 - accuracy
if cmap is None:
cmap = plotter.get_cmap('Blues')
plotter.figure(figsize=(10,10))
plotter.imshow(cm, interpolation='nearest', cmap=cmap)
plotter.title(title)
plotter.colorbar()
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
thresh = cm.max() / 1.5 if normalize else cm.max() / 2
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
if normalize:
plotter.text(j, i, "{:0.4f}".format(cm[i, j]),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
else:
plotter.text(j, i, "{:,}".format(cm[i, j]),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plotter.tight_layout()
plotter.ylabel('True label')
plotter.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
plotter.show()
# plotter.savefig('initial.png')
class spambase(object):
def __init__(self):
# OOP style
self.data = pd.read_csv('spambase.data', header=None)
self.data.rename(columns = {57 : 'is_spam'}, inplace=True)
self.statisticsSpam = dict()
self.statisticsNotSpam = dict()
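# spambase.data is the UCI spambase file: 57 numeric features per row plus the
# class label in column 57 (1 = spam, 0 = not spam), which is renamed above.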
# split train test data
def generateData(self):
# separate dataset into spam and not spam
spam = self.data[self.data['is_spam'] == 1]
notSpam = self.data[self.data['is_spam'] == 0]
# split using sklearn test train split - random 0.6 for training and 0.4 for testing
spam_train, spam_test = train_test_split(spam, train_size=0.6)
notSpam_train, notSpam_test = train_test_split(notSpam, train_size=0.6)
# divide the data from labels for easier fitting
trainData = pd.concat([notSpam_train, spam_train])
trainLabels = trainData.pop('is_spam')
testData = pd.concat([notSpam_test, spam_test])
testLabels = testData.pop('is_spam')
# return them
return trainData, trainLabels, testData.values.tolist(), testLabels.values.tolist()
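# note: trainData/trainLabels stay as pandas objects for the statistics step, while
# the test data and labels come back as plain Python lists for the predict loop.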
def generateStatistics(self, train, trainLabels):
''' builds the probabilistic model. Need to make it less verbose. '''
# converting to list for better manipulation
train = train.values.tolist()
trainLabels = trainLabels.values.tolist()
spam, notSpam = [], []
for index, (row, label) in enumerate(zip(train, trainLabels)):
if label == 1:
spam.append(train[index])
else:
notSpam.append(train[index])
# converting to pandas for better statistic calculation
spamdf = pd.DataFrame(spam)
notSpamdf = pd.DataFrame(notSpam)
# find mean, std for each feature given its class
for column in spamdf.columns:
strColumn = str(column)
meanOfCol = spamdf[column].mean()
stdOfCol = spamdf[column].std()
# to avoid divide by zero error in gaussian naive bayes
if meanOfCol == 0:
meanOfCol = 0.00001
if stdOfCol == 0:
stdOfCol = 0.000001
self.statisticsSpam[strColumn] = [meanOfCol, stdOfCol]
# find mean, std for each feature given its class
for column in notSpamdf.columns:
strColumn = str(column)
meanOfCol = notSpamdf[column].mean()
stdOfCol = notSpamdf[column].std()
# to avoid divide by zero error in gaussian naive bayes
if meanOfCol == 0:
meanOfCol = 0.00001
if stdOfCol == 0:
stdOfCol = 0.000001
self.statisticsNotSpam[strColumn] = [meanOfCol, stdOfCol]
# pdb.set_trace()
return self.statisticsNotSpam, self.statisticsSpam
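# The fitted model is one Gaussian per (feature, class) pair, e.g.
# self.statisticsSpam['3'] == [mean of feature 3 over spam rows, std of feature 3 over spam rows],
# and likewise for self.statisticsNotSpam; predict() looks these up by stringified column index.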
def alternativeStatistics(self, train, trainLabels):
'''
This function was written for sanity check - to make sure model is built properly.
Fundamentally it is the same as generateStatistics(). Since I wrote it anyway, I decided to leave it as is.
'''
spamStats, notSpamStats = {}, {}
train = train.values.tolist()
trainLabels = trainLabels.values.tolist()
spam, notspam = list(), list()
for data, label in zip(train, trainLabels):
if label == 0:
notspam.append(data)
else:
spam.append(data)
numpySpam = numpy.array(spam).T
numpyNSpam = numpy.array(notspam).T
for idx, row in enumerate(numpySpam):
spamStats[str(idx)] = [numpy.mean(row), numpy.std(row)]
# print(numpySpam.shape, numpyNSpam.shape)
for idx, row in enumerate(numpyNSpam):
notSpamStats[str(idx)] = [numpy.mean(row), numpy.std(row)]
# pdb.set_trace()
return notSpamStats, spamStats
# computes prior values for the two classes - count spam/not spam instances
def computePrior(self, labels):
# priorSpam, priorNSpam = 0.0, 0.0
priorSpam = (labels == 1).astype(int).sum() / len(labels)
priorNSpam = (labels == 0).astype(int).sum() / len(labels)
# approx. 0.394 for spam and 0.606 for not spam if you print them out.
return priorSpam, priorNSpam
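# For reference, the priors are just class frequencies:
#   P(spam)     = count(label == 1) / count(all)  ~ 1813 / 4601  ~ 0.394
#   P(not spam) = count(label == 0) / count(all)  ~ 2788 / 4601  ~ 0.606
# (counts from the full spambase data; a random 60% training split gives nearly the same values)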
def gaussianProbability(self, feature, mean, std):
# computes gaussian probability - given in the slides
first_term = (1 / numpy.sqrt(2 * numpy.pi * numpy.power(std, 2)))
second_term = numpy.exp(-numpy.power((feature - mean), 2) / (2 * numpy.power(std, 2)))
prob = first_term * second_term
if prob == 0:
prob = 0.0001
return numpy.log(prob)
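# For reference, the density used above is the univariate Gaussian
#   N(x; mu, sigma) = 1 / sqrt(2 * pi * sigma^2) * exp(-(x - mu)^2 / (2 * sigma^2))
# Returning its log keeps predict() numerically stable: per-feature terms are added
# instead of multiplying many tiny probabilities together.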
# predicting test instances using gaussian probability
def predict(self, testInstance, testLabel, priors):
# start each class score with the log of its prior, then add per-feature log likelihoods
positiveProbabilities = list()
positiveProbabilities.append(numpy.log(priors[0]))
negativeProbabilites = list()
negativeProbabilites.append(numpy.log(priors[1]))
for i, X in enumerate(testInstance):
# converting to string because dict keys are str while enumerate returns an int
i = str(i)
# calc positive class probability using the dict built above
mean, std = self.statisticsSpam[i]
posp = self.gaussianProbability(X, mean, std)
if numpy.isinf(posp):
posp = 0.000001
positiveProbabilities.append(posp)
# calc negative class probability using the dict built above
mean, std = self.statisticsNotSpam[i]
negp = self.gaussianProbability(X, mean, std)
if numpy.isinf(negp):
negp = 0.000001
negativeProbabilites.append(negp)
positivePred = sum(positiveProbabilities)
negativePred = sum(negativeProbabilites)
# the class with the larger summed log probability wins: 1 = spam, 0 = not spam
if positivePred > negativePred:
return 1
else:
return 0
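# Taken together, predict() implements the Gaussian naive Bayes decision rule
#   argmax over c in {spam, not spam} of  log P(c) + sum_i log N(x_i; mu_ic, sigma_ic)
# under the naive assumption that features are conditionally independent given the class.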
# an explicit accuracy function would still be handy, although accuracy is also reported by the confusion matrix plot; a minimal sketch follows below.
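# A minimal sketch of that explicit accuracy check (assumes predictions and labels are
# equal-length sequences of 0/1 ints); simpleAccuracy is not part of the original assignment code.
def simpleAccuracy(predictions, labels):
    # fraction of test instances whose predicted label matches the true label
    correct = sum(1 for p, y in zip(predictions, labels) if p == y)
    return correct / len(labels)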
def main():
classifier = spambase()
train, trainLabels, test, testLabels = classifier.generateData()
stats = classifier.generateStatistics(train, trainLabels)
# stats2 = classifier.alternativeStatistics(train, trainLabels)
# hard coded the priors; calling computePrior() on the training labels gives (almost exactly) the same values
# priors = classifier.computePrior(trainLabels)
priors = (0.394, 0.606)
predictions = [classifier.predict(instance, label, priors) for instance, label in zip(test, testLabels)]
# print("predicted spam - {} predicted nspam - {}".format(predictions.count(1), predictions.count(0)))
# print("actual spam - {} actual nspam - {}".format(testLabels.count(1), testLabels.count(0)))
# prints the accuracy, misclassified points, confusion matrix of the two classes.
# cm = confusion_matrix(predictions, testLabels)
# print(cm)
# pdb.set_trace()
plot_confusion_matrix(confusion_matrix(testLabels, predictions))
if __name__ == '__main__':
main()