# train.py
#
# Copyright 2017 Grant Garrett-Grossman <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This model is heavily based off of the scikit-learn example for
# classifying film reviews.
# Nevertheless, it is a notable contribution to society: a program
# capable of discerning whether or not a text is a crackpot conspiracy theory.
import logging
import pprint # Pretty stuff
import re
import sys # For command line arguments
from time import time # to show progress
import numpy as np
from matplotlib import pyplot as plt
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
# (sklearn.externals.joblib was removed in newer scikit-learn; use the standalone joblib package there)
from sklearn.externals import joblib  # In order to save the trained model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tabulate import tabulate # More pretty stuff
# import matplotlib.pyplot as plt # For plotting confusion matrix
# Tokenizer that strips punctuation and lemmatizes each token
def tokenize(text):
    text = re.sub(r'\W+', ' ', text)  # replace runs of non-word characters with a space
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for item in tokens:
        lemmas.append(lemmatizer.lemmatize(item))
    return lemmas
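# Illustrative example (exact output depends on the NLTK data installed):
# tokenize("The lizards are controlling everything!")
# -> ['The', 'lizard', 'are', 'controlling', 'everything']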
# Plot the most important features
def plot_coefficients(classifier, feature_names, top_features=20):
    print("Generating list of most important features")
    coef = classifier.coef_.ravel()
    # indices of the most positive and most negative weights
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    print("Creating Plot...")
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    # ticks start at 0 so the labels line up with the bars drawn above
    plt.xticks(np.arange(2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    plt.savefig('figure.svg', bbox_inches='tight')
    plt.show()
if __name__ == "__main__":
    # NOTE: we put the following in an 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel,
    # which is used when n_jobs != 1 in GridSearchCV.

    # Display progress logs on stdout
    print("Initializing...")

    # Command line arguments: where to save the model, and the training data directory
    save = sys.argv[1]
    training_directory = sys.argv[2]
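    # Illustrative usage (file names here are just examples):
    #   python train.py conspiracy_model.pkl ./training_data/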
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
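    # load_files expects one subdirectory per class inside training_directory,
    # each holding the raw text files; the subdirectory names become
    # dataset.target_names.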
    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))
    # Split the dataset into training and test sets
    print("Splitting the dataset into training and test sets...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent.
    # Also remove stop words.
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]
    print("Stop words list loaded...")
    print("Setting up pipeline...")
    pipeline = Pipeline(
        [
            # ('vect', TfidfVectorizer(stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1,1))),
            ('vect',
             TfidfVectorizer(tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5, ngram_range=(1, 1))),
            ('clf', LinearSVC(C=5000)),
        ])
    print("Pipeline:", [name for name, _ in pipeline.steps])
    # Build a grid search to optimize parameters
    # Fit the pipeline on the training set using grid search for the parameters
    print("Initializing grid search...")
    # Uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2)],
        # 'vect__min_df': (0.0005, 0.001),
        # 'vect__max_df': (0.25, 0.5),
        # 'clf__C': (10, 15, 20),
    }
    print("Parameters:")
    pprint.pprint(parameters)
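    # With the parameter grid left empty, GridSearchCV simply cross-validates
    # the single pipeline configuration defined above; uncomment entries in
    # `parameters` to actually search.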
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        verbose=True)

    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))
    # Print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f'
              % (grid_search.cv_results_['params'][i],
                 grid_search.cv_results_['mean_test_score'][i],
                 grid_search.cv_results_['std_test_score'][i]))
    # Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # Print best parameters
    best_parameters = grid_search.best_estimator_.get_params()
    print("Best parameters:\n")
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    # Save model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model Saved! \nPrepare for some awesome stats!")
    # Print the classification report
    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    # Print and plot the confusion matrix
    print("\nConfusion Matrix:")
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm, "\n \n")
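    # In the matrix above, rows correspond to the true classes and columns to
    # the predicted classes, in the order of dataset.target_names.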
    # plt.matshow(cm)
    # plt.show()
    # Display most important features
    n = 20
    i = 1
    a = []
    vectorizer = grid_search.best_estimator_.named_steps['vect']
    clf = grid_search.best_estimator_.named_steps['clf']
    # (get_feature_names() was renamed to get_feature_names_out() in newer scikit-learn)
    feature_names = vectorizer.get_feature_names()
    # Pair each weight with its feature name; for a binary problem like this one,
    # the most negative weights are the strongest indicators of the first class
    # and the most positive of the second.
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print("Top %d most important features:" % n)
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        # print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        a.append([i, coef_1, fn_1, coef_2, fn_2])
        i += 1
    print(tabulate(a, ["Rank", "Weight", "Bonkers", "Weight", "Fine"], tablefmt="fancy_grid"))
    # print(tabulate(a, ["Rank", "Weight", "Bonkers", "Weight", "Fine"], tablefmt="html"))
    print("Plotting...")
    plot_coefficients(clf, feature_names)