-
Notifications
You must be signed in to change notification settings - Fork 0
/
electronics.py
121 lines (59 loc) · 2.47 KB
/
electronics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Load the tab-separated dataset from the working directory. The file has no
# header row, so the two columns are named explicitly here.
# The tutorial copy of this file is also available at
# https://raw.githubusercontent.com/justmarkham/pydata-dc-2016-tutorial/master/sms.tsv
# and that URL can be passed to read_table in place of the local path.
column_names = ['label', 'message']
sms = pd.read_table('sms.tsv', header=None, names=column_names)

# Quick look at the data: dimensions, first 10 rows, and rows per class.
print(sms.shape)
print(sms.head(10))
print(sms['label'].value_counts())

# Encode the text label as an integer target for the classifier.
# NOTE(review): any label other than these two keys maps to NaN - confirm the
# file only contains 'nonelec' and 'ELECTRONICS'.
label_to_num = {'nonelec': 0, 'ELECTRONICS': 1}
sms['label_num'] = sms['label'].map(label_to_num)

# Show the first rows again, now including the new label_num column.
print(sms.head(10))
# Features and target for use with CountVectorizer: X is the raw message text
# column and y is the numeric label column. (For a numeric dataset such as
# iris, one would instead use X = iris.data and y = iris.target with a model
# directly.)
X, y = sms['message'], sms['label_num']

# Both should report the same number of rows.
print(X.shape)
print(y.shape)
# Split X and y into training and testing sets.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases. The
# fallback keeps the script working on pre-0.18 installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state is fixed so the split (and the reported accuracy) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Sanity-check the sizes of the four resulting pieces.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# Instantiate the vectorizer (a TF-IDF vectorizer or another feature
# extractor could be used here instead).
vect = CountVectorizer()

# Learn the vocabulary from the training messages and build the training
# document-term matrix in a single pass; this is equivalent to calling
# fit() followed by transform() on the same data.
X_train_dtm = vect.fit_transform(X_train)

# Examine the document-term matrix. The dimensions are read from the matrix
# itself rather than hard-coded, so the message stays correct for any dataset
# or split (rows = training documents, columns = vocabulary size).
print("\n document-term of training matrix sparse matrix of {0}x{1} where {1} is the number of features\n".format(*X_train_dtm.shape))
print(X_train_dtm)

# Transform the testing data with the already-fitted vocabulary. There is no
# fit here: the vocabulary learned from the training data is reused, so the
# test matrix has the same columns as the training matrix (its row count is
# the number of test documents).
X_test_dtm = vect.transform(X_test)
print("\n document-term of testing matrix dtm of same size of as train \n")
print(X_test_dtm)
# Model and metric imports for the classification step.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier and train it on the training
# document-term matrix (fit returns the fitted estimator itself).
nb = MultinomialNB().fit(X_train_dtm, y_train)

# Predict a class for every document in the test document-term matrix.
y_pred_class = nb.predict(X_test_dtm)

# Report the fraction of test documents whose predicted class matches y_test.
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(accuracy)