-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathsvm book.R
421 lines (333 loc) · 14.7 KB
/
svm book.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
library(stats)
library(ggplot2)
library(dplyr)
library(stringr)
library("MASS")
library("car")
library(caTools)
library(e1071)
library(caret)
library(readr)
library(kernlab)
#************************************ Rough Code Draft***************************************#
##Note: Performance Linear Regression to predict Age=0.75119
##Note: Performance Missing Age replaced with median/mean=0.75598
##Trying SVM model from code line 435
survival_db<- read.csv("train.csv", stringsAsFactors = FALSE, na.strings = c("", " ", "na", "NA"))
test_db<- read.csv("test.csv", stringsAsFactors = FALSE, na.strings = c("", " ", "na", "NA"))
sapply(survival_db, function(x) sum(is.na(x)))
#Missing Values in input DB
#Age= 177
#Cabin= 687[Extract Deck]
#Embarked= 2*
sapply(test_db, function(x) sum(is.na(x)))
#Cabin=327[deleted]
#Fare=1
#Age=86
#Lets Round off Age to one digit after the decimal space
survival_db$Age<- round(survival_db$Age, 1)
survival_db$Embarked[which(is.na(survival_db$Embarked))]<- "S"
#62 and 830 have missing values for Embarked Filled with "S"
dump1<-filter(survival_db, Pclass==1 & Fare>=80 & SibSp==0 & Parch==0 & Fare<=90 )
ggplot(survival_db, aes(x=as.factor(survival_db$Embarked)))+geom_bar()
boxplot(survival_db$Fare~survival_db$Embarked)
#Extracting Ticket Number from the Ticket column
dump2<-data.frame(str_split(survival_db$Ticket, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<- data.frame(t(dump2))
rownames(dump2)<- 1:nrow(survival_db)
colnames(dump2)[2]<- "Ticket_Number"
survival_db<-cbind(survival_db, dump2$Ticket_Number)
rownames(survival_db)<- 1:nrow(survival_db)
colnames(survival_db)[13]<- "Ticket_Number"
survival_db$Ticket_Number<- as.character(survival_db$Ticket_Number)
survival_db$Ticket_Number<-str_replace(survival_db$Ticket_Number, pattern = "2. ", replacement = "")
survival_db$Ticket_Number[474]<-"541"
survival_db$Ticket_Number<- str_replace(survival_db$Ticket_Number, pattern = "LINE", replacement = "1466287")
survival_db$Ticket_Number<- as.numeric(as.character(survival_db$Ticket_Number))
#Rounding off Fare to one digit
survival_db$Fare<- round(survival_db$Fare, 1)
#Creating the Deck Column
survival_db$Deck<- str_sub(survival_db$Cabin, start = 1L, end = 1L)
survival_db$Deck[which(is.na(survival_db$Deck)==T)]<- "Unknown"
unique(survival_db$Deck)
which(survival_db$Deck=="T")
survival_db[340,]
#Clearly ther is only one instance with which the Deck is "T", we will replace this record with "Unknown" to maintain uniformity with test dataset
survival_db$Deck[which(survival_db$Deck=="T")]<- "Unknown"
#Extracting Name Title
dump1<- data.frame(str_split(survival_db$Name, pattern = ", ", n=2), stringsAsFactors = FALSE)
dump1<-data.frame(t(dump1))
dump2<- data.frame(str_split(dump1$X2, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<-data.frame(t(dump2))
rownames(dump2)<- 1:nrow(dump2)
dump2$X1<- as.character(dump2$X1)
dump2$X1[which(dump2$X1=="the")]<- "Countess."
survival_db<- cbind(survival_db, dump2$X1)
colnames(survival_db)[15]<- "Title"
survival_db$Title<- as.character(survival_db$Title)
#Removing un-necessary columns from the dataset
survival_db<- survival_db[,-c(4,9,11)]
rm(dump1)
rm(dump2)
#Computing the total number of family members
survival_db$Family<- survival_db$SibSp + survival_db$Parch
#Regularizing the Title
unique(survival_db$Title)
men<- c("Mr.", "Don.", "Rev.","Major.","Sir.","Col.", "Capt.", "Jonkheer.")
girls<- c("Ms." , "Mlle.")
women<- c("Mrs.", "Mme.","Lady.", "Countess.", "Dona.")
nam_title_reg<- function(inp_str, gender)
{
if(inp_str %in% men)
return("Mr.")
else if(inp_str %in% women)
return("Mrs.")
else if(inp_str %in% girls)
return("Miss.")
else if(inp_str=="Dr." & gender=="male")
return("Mr.")
else if(inp_str=="Dr." & gender=="female")
return("Mrs.")
else return(inp_str)
}
for(i in 1:nrow(survival_db))
{
survival_db$Title[i]<- nam_title_reg(survival_db$Title[i], survival_db$Sex[i])
}
#Creating a Ratio Metric
survival_db$Ratio<- survival_db$Fare/(survival_db$Family+1)
survival_db$Ratio<- round(survival_db$Ratio, 1)
#Building a LM model to derive Missing Age values
#Trying Correlation plots
dabba<- survival_db[,c(1,3,5,6,7,8,10,13,14)]
dabba<- na.omit(dabba)
cor(dabba)
#Trying an LinReg to predict age.
set.seed(85)
jix<- sample(1:nrow(dabba), 0.8*nrow(dabba))
zax_trng<- dabba[jix,]
zwx_test<- dabba[-jix,]
names(zax_trng)
mod_1<-lm(Age~., data=zax_trng)
summary(mod_1)
mod_2<- stepAIC(mod_1, direction = "both")
summary(mod_2)
vif(mod_2)
#Arrived at the final model Adjusted R-squared: 0.2144
pred_age<- predict(mod_2, zwx_test)
zwx_test<- cbind(zwx_test, pred_age)
zwx_test$pred_age<- round(zwx_test$pred_age, 1)
cor(zwx_test$Age, zwx_test$pred_age)
#0.507 so that's a decent model
#lm(formula = Age ~ Pclass + SibSp + Parch + Ratio, data = zax_trng)
Estimate_age<- predict(mod_2, survival_db)
survival_db<-cbind(survival_db, Estimate_age)
survival_db$Estimate_age[which(survival_db$Estimate_age<=0)]<- median(survival_db$Age, na.rm = TRUE)
survival_db$Estimate_age<- round(survival_db$Estimate_age, 1)
#Imputing Missing Age values
for(i in 1:nrow(survival_db))
{
if(is.na(survival_db$Age[i])==TRUE)
{
survival_db$Age[i]<- survival_db$Estimate_age[i]
}
}
survival_db<- survival_db[,-15]
#Defining a Measure Age*Pclass
survival_db$Measure<- survival_db$Age*survival_db$Pclass
rm(dabba)
rm(zax_trng)
rm(zwx_test)
#************************************ Understanding the Data ***************************************#
#Structure of the dataset
str(survival_db)
#Checking the number of distinct variables in the dataset
length(unique(survival_db$PassengerId))
unique(survival_db$Survived)
#Converting Survived to a factor variable
survival_db$Survived<- factor(survival_db$Survived)
unique(survival_db$Pclass)
unique(survival_db$Sex)
length(unique(survival_db$SibSp))
length(unique(survival_db$Parch))
unique(survival_db$Embarked)
length(unique(survival_db$Deck))
unique(survival_db$Title)
length(unique(survival_db$Family))
names(survival_db)
#Transfering all factor variables to one table and creating dummies
cat_tab<- survival_db[,c(3,4,9,11,12)]
cat_tab<- data.frame(sapply(cat_tab, factor))
str(cat_tab)
cat_tab<- data.frame(model.matrix(~., data = cat_tab))
cat_tab<- cat_tab[,-1]
#Deleting Categorical Columns
survival_db<- survival_db[,-c(3,4,9,11,12)]
#Scaling the numeric variables
num_tab<- survival_db[,-2]
num_tab<- data.frame(sapply(num_tab, scale))
survival_db<- data.frame(survival_db[,-c(1,3:10)])
#Combining the dataframes
survival_db<- cbind(survival_db, num_tab)
survival_db<- cbind(survival_db, cat_tab)
colnames(survival_db)[1]<- "Survived"
#Deleting PassengerID, Fare/Ticket Number
survival_db<- survival_db[,-c(2,7)]
#Splitting the survival_db into test and training dataset
set.seed(85)
index<- sample.split(survival_db$Survived, SplitRatio = 0.8)
trng_data<- survival_db[index,]
test_data<- survival_db[!index,]
names(trng_data)
#************************************ Building SVM Model ***************************************#
#Lets Try our linear svm model
linear_SVM_mod <- ksvm(Survived~.,
data=trng_data,
scale=FALSE,
kernel="vanilladot")
#Please Note: There will be a warning message [In .local(x, ...) : Variable(s) `' constant. Cannot scale data.]
#This warning message is due to scaling issues as most values are nearing zero and have low variability in each column. It can be ignored.
linear_SVM_mod
#Predicting Labels
linmod_predicted_label<- predict(linear_SVM_mod, test_data)
#Evaluating ConfusionMatrix
lin_conf_mat<- confusionMatrix(linmod_predicted_label, test_data$Survived)
lin_conf_mat
#Trying Polynomial SVM---------> This lead to no improvement over the Linear SVM model. Trying the RBF model
poly_SVM_mod <- ksvm(Survived~.,data=trng_data, scale=FALSE, kernel="polydot")
poly_SVM_mod
#Predicting Labels
polymod_predicted_label<- predict(poly_SVM_mod, test_data)
#Evaluating ConfusionMatrix
poly_conf_mat<- confusionMatrix(polymod_predicted_label, test_data$Survived)
poly_conf_mat
#Trying RBF Mod
RBF_nonlin_SVM_mod <- ksvm(Survived~.,
data=trng_data,
scale=FALSE,
kernel="rbfdot")
#Please Note: There will be a warning message [In .local(x, ...) : Variable(s) `' constant. Cannot scale data.]
#This warning message is due to scaling issues as most values are nearing zero and have low variability in each column. It can be ignored.
RBF_nonlin_SVM_mod
#Predicting Labels
RBFmod_predicted_label<- predict(RBF_nonlin_SVM_mod, test_data)
#Evaluating ConfusionMatrix
RBFmod_conf_mat<- confusionMatrix(RBFmod_predicted_label, test_data$Survived)
RBFmod_conf_mat
#Since RBF mod for SVM gave better results we will proceed with tuning the HyperParamters of RBF mod.
#************************************ Hyperparameter Tuning for RBF SVM_Model ***************************************#
#[5.3]Creating the Cross Validation Control Table
cross_valid_control <- trainControl(method = "cv", number = 5, verboseIter=TRUE)
#Setting the cross validation training set evaluation parameter as Accuracy
eval_parameter <- "Accuracy"
#Setting Seed Value for sampling Training Data Used for Cross Validation
set.seed(85)
#Creating HyperParameter Grid to test Sigma Values: [3.2207e-2, 4.2207e-2, 5.2207e-2] and Cost Values: [1, 2, 3]
hyperpar_grid <- expand.grid(.sigma = c(4.2971e-2, 5.2971e-2, 6.2971e-2),.C=c( 1, 2, 3))
#Evaluating the Three Fold HyperParameter Tuning on the basis of Accuracy [evaluation parameter]
RBF_nonlin_tuned_mod.fit<- train(Survived~.,
data=trng_data,
method="svmRadial",
metric=eval_parameter,
tuneGrid=hyperpar_grid,
trControl=cross_valid_control)
#[5.4]Interpretting the results of the 3-fold cross-validation for HyperParameter Tuning
RBF_nonlin_tuned_mod.fit
#Plotting the Results of model Accuracy for each combination of sigma and cost
plot(RBF_nonlin_tuned_mod.fit)
#Using tuned parameters to buid the final RBF model
RBF_nonlin_final_mod <- ksvm(Survived~.,
data=survival_db,
kernel="rbfdot",
scale=FALSE,
C=2,
kpar=list(sigma=4.2971e-2))
RBF_nonlin_final_mod
#Final RBF model's performance
RBFfinmod_test_predict_labels <- predict(RBF_nonlin_final_mod, test_data)
RBFfinmod_test_conf_mat<- confusionMatrix(RBFfinmod_test_predict_labels, test_data$Survived)
RBFfinmod_test_conf_mat
#************************************ Preparing Test Database ***************************************#
sapply(test_db, function(x) sum(is.na(x)))
#Cabin=327[deleted]
#Fare=1
#Age=86
#Rounding off Age to one decimal space.
test_db$Age<- round(test_db$Age, 1)
#There is one missing value for Fare in our Dataset. We will impute the missing value with the median
#This will not impact the performance as Fare is not a significant parameter to our logit model. But it is required to impute the records with missing age
summary(test_db$Fare)
which(is.na(test_db$Fare)==TRUE)
test_db[153,]
dump1<- filter(test_db, test_db$Pclass==3, test_db$Embarked=="S")
summary(dump1$Fare)
#Clearly the data is skewed. We will impute the missing fare with the median value
test_db$Fare[which(is.na(test_db$Fare)==TRUE)]<- median(test_db$Fare, na.rm = TRUE)
test_db$Fare<- round(test_db$Fare,1)
#Creating the Deck Column
test_db$Deck<- str_sub(test_db$Cabin, start = 1L, end = 1L)
test_db$Deck[which(is.na(test_db$Deck)==TRUE)]<- "Unknown"
#Extracting the Title from the name column
dump1<- data.frame(str_split(test_db$Name, pattern = ", ", n=2), stringsAsFactors = FALSE)
dump1<-data.frame(t(dump1))
dump2<- data.frame(str_split(dump1$X2, pattern = " ", n=2), stringsAsFactors = FALSE)
dump2<-data.frame(t(dump2))
rownames(dump2)<- 1:nrow(dump2)
dump2$X1<- as.character(dump2$X1)
unique(dump2$X1)
#From Here we know that "Dona." is a potuguese title given to women. It is equivalent to Mrs.
#There we have added this title in the women vector
test_db<- cbind(test_db, dump2$X1)
colnames(test_db)[13]<- "Title"
test_db$Title<- as.character(test_db$Title)
for(i in 1:nrow(test_db))
{
test_db$Title[i]<- nam_title_reg(test_db$Title[i], test_db$Sex[i])
}
unique(test_db$Title)
#Creating the Family Size Metric
test_db$Family<- test_db$SibSp + test_db$Parch
#Creating the Ratio Metric
test_db$Ratio<- test_db$Fare/(test_db$Family+1)
test_db$Ratio<- round(test_db$Ratio,1)
#Removing the Name, Ticket and Cabin columns from the dataset as these columns are not relavant for out analysis
test_db<- test_db[, -c(3,8,10)]
#Using the Linear Regression Model to Predict the age of the records with the missing Age
Estimate_age<- predict(mod_2, test_db)
test_db<- cbind(test_db, Estimate_age)
test_db$Estimate_age[which((test_db$Estimate_age<=0)==TRUE)]<- median(test_db$Age, na.rm = TRUE)
test_db$Estimate_age<- round(test_db$Estimate_age, 1)
#Imputing missing Age with the Estimate Age
for(i in 1:nrow(test_db))
{
if(is.na(test_db$Age[i])==TRUE)
{
test_db$Age[i]<- test_db$Estimate_age[i]
}
}
#Defining the Measure Column
test_db$Measure<- test_db$Age * test_db$Pclass
#At this Stage we Have successfully adressed all the missing values and feature engineering stages of the Test Dataset
#We Will now prepare the dataset so that it is compatible with our SVM model
#To prepare the dataset we will create dummy variables for the categorical variables and scale all numerical values
#creating dummy variables for the categorical variables
cat_tab<- test_db[,c(2,3,8:10)]
cat_tab<- data.frame(sapply(cat_tab, factor))
cat_tab<- data.frame(model.matrix(~., cat_tab))
#Deck T is clearly missing in the dataset
cat_tab<- cat_tab[,-1]
num_tab<- test_db[,c(4:7,11,12,14)]
num_tab<- sapply(num_tab, scale)
test_db<- cbind(test_db$PassengerId,num_tab, cat_tab)
colnames(test_db)[1]<- "PassengerId"
PassID<- test_db$PassengerId
test_db<- test_db[,-1]
#Using tuned RBF Model to predict survival labels for the test_db dataset
#Predicting the Survival Labels
RBFfinmod_test_predict_labels <- predict(RBF_nonlin_final_mod, test_db)
test_db<- cbind(PassID, test_db, RBFfinmod_test_predict_labels)
colnames(test_db)[1]<- "PassengerId"
colnames(test_db)[24]<-"Survived"
#Submitting the Predictions
amw_svmrbf_titanic<- test_db[,c(1,24)]
write.csv(amw_svmrbf_titanic, "amw_svmrbf_titanic_sub.csv", row.names = FALSE)