#K Nearest Neighbors with Python
#Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
#Load the Data
df = pd.read_csv("Classified Data",index_col=0)
df.head()
#Standardize the Variables
#Because the KNN classifier predicts the class of a given test observation
#by identifying the observations that are nearest to it, the scale of the
#variables matters. Any variables that are on a large scale will have a much
#larger effect on the distance between the observations, and hence on the KNN
#classifier, than variables that are on a small scale.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop('TARGET CLASS',axis=1))
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()
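# A minimal sketch of why scaling matters for a distance-based classifier:
# compare the Euclidean distance between the first two rows before and after
# standardization. Features on a large raw scale dominate the unscaled distance.
# (This check is illustrative only and not part of the original walkthrough.)
raw_pair = df.drop('TARGET CLASS', axis=1).iloc[:2].values
print('Distance on raw features:   ', np.linalg.norm(raw_pair[0] - raw_pair[1]))
print('Distance on scaled features:', np.linalg.norm(scaled_features[0] - scaled_features[1]))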
#Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['TARGET CLASS'],
                                                    test_size=0.30)
## Using KNN
#Remember that we are trying to build a model to predict whether an observation
#belongs to the TARGET CLASS or not. We'll start with k=1.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
# Output:
# KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
#            weights='uniform')
pred = knn.predict(X_test)
#Predictions and Evaluations
#Let's evaluate our KNN model.
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))
#[[125 18]
# [ 13 144]]
print(classification_report(y_test,pred))
# precision recall f1-score support
#
# 0 0.91 0.87 0.89 143
# 1 0.89 0.92 0.90 157
#
#avg / total 0.90 0.90 0.90 300
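# A minimal sketch of unpacking the confusion matrix above into its four cells;
# the labels assume class 0 is listed first, which is scikit-learn's default
# sorted ordering. (Illustrative addition, not part of the original walkthrough.)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
print('True negatives:', tn, ' False positives:', fp)
print('False negatives:', fn, ' True positives:', tp)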
#Choosing a K Value
#Let's go ahead and use the elbow method to pick a good K Value:
error_rate = []
# Will take some time
for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
#Output for this code can be viewed at : https://tinyurl.com/y8p2kddm
#Here we can see that after around K>23 the error rate just tends to hover
#around 0.05-0.06. Let's retrain the model with that value and check the classification report!
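# The elbow plot above scores each K on a single test split, so the curve can be
# noisy. A minimal sketch of an alternative using cross-validation instead
# (the 5-fold choice is an assumption, not part of the original walkthrough):
from sklearn.model_selection import cross_val_score
cv_scores = []
for k in range(1, 40):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             scaled_features, df['TARGET CLASS'], cv=5)
    cv_scores.append(scores.mean())
best_k = int(np.argmax(cv_scores)) + 1   # +1 because the range starts at k=1
print('Best K by 5-fold cross-validation:', best_k)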
# FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=1')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
# Output:
# WITH K=1
#[[125 18]
# [ 13 144]]
# precision recall f1-score support
#
# 0 0.91 0.87 0.89 143
# 1 0.89 0.92 0.90 157
#
#avg / total 0.90 0.90 0.90 300
# NOW WITH K=23
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=23')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
# Output:
# WITH K=23
#[[132 11]
# [ 5 152]]
# precision recall f1-score support
#
# 0 0.96 0.92 0.94 143
# 1 0.93 0.97 0.95 157
#avg / total 0.95 0.95 0.95 300
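# A minimal sketch of scoring a new, unlabeled observation with the final K=23
# model: new data must go through the same scaler that was fit on the training
# features. The example row here is simply the first row of the dataset, reused
# purely for illustration (an assumption, not part of the original walkthrough).
new_raw = df.drop('TARGET CLASS', axis=1).iloc[[0]]
new_scaled = scaler.transform(new_raw)
print('Predicted class for the example row:', knn.predict(new_scaled)[0])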