pretrained.py

"""
Train a naive Bayes classifier from the IMDb reviews data set
"""
from __future__ import division
from collections import defaultdict
from math import log, exp
from functools import partial
import re
import os
import random
import pickle
import pylab
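
# Load the pretrained counts. The "trained" pickle is assumed to hold three
# objects: `sums` (total token counts per class, keyed 'pos'/'neg') plus the
# per-word count dicts `positive` and `negative` produced by the training step.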
handle = open("trained", "rb")
sums, positive, negative = pickle.load(handle)
def tokenize(text):
    return re.findall(r"\w+", text)

def negate_sequence(text):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    words = text.split()
    for word in words:
        stripped = word.strip(delims).lower()
        result.append("not_" + stripped if negation else stripped)
        if any(neg in word for neg in frozenset(["not", "n't", "no"])):
            negation = not negation
        if any(c in word for c in delims):
            negation = False
    return result

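# Example (hand-traced): negate_sequence("I did not like this movie.")
#   -> ['i', 'did', 'not', 'not_like', 'not_this', 'not_movie']
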
# Smoothed per-class word likelihoods: add one to the word's count and scale
# by twice the class's total token count.
def get_positive_prob(word):
    return 1.0 * (positive[word] + 1) / (2 * sums['pos'])

def get_negative_prob(word):
    return 1.0 * (negative[word] + 1) / (2 * sums['neg'])

def classify(text, pneg = 0.5, preprocessor=negate_sequence):
    words = preprocessor(text)
    pscore, nscore = 0, 0
    for word in words:
        pscore += log(get_positive_prob(word))
        nscore += log(get_negative_prob(word))
    return pscore > nscore

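# classify() above makes a naive Bayes decision with equal class priors: the
# text is labelled positive when sum(log P(w|pos)) exceeds sum(log P(w|neg)).
# Note that the `pneg` prior argument is accepted but never used in the score.
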
def classify_demo(text):
    words = negate_sequence(text)
    pscore, nscore = 0, 0
    for word in words:
        pdelta = log(get_positive_prob(word))
        ndelta = log(get_negative_prob(word))
        pscore += pdelta
        nscore += ndelta
        print "%25s, pos=(%10lf, %10d) \t\t neg=(%10lf, %10d)" % (word, pdelta, positive[word], ndelta, negative[word])
    print "\nPositive" if pscore > nscore else "Negative"
    print "Confidence: %lf" % exp(abs(pscore - nscore))
    return pscore > nscore, pscore, nscore

def test():
    strings = [
        open("pos_example").read(),
        open("neg_example").read(),
        "This book was quite good.",
        "I think this product is horrible."
    ]
    print map(classify, strings)

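# mutual_info below computes I(word; class) in nats over the 2x2 table of
# (word present / absent) x (positive / negative) token counts:
#   I = sum over cells of  N_cell / N * log(N * N_cell / (N_row * N_col))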
def mutual_info(word):
    """
    Finds the mutual information of a word with the training set.
    """
    cnt_p, cnt_n = sums['pos'], sums['neg']
    total = cnt_n + cnt_p
    cnt_x = positive[word] + negative[word]
    if cnt_x == 0:
        return 0
    cnt_x_p, cnt_x_n = positive[word], negative[word]
    # 2x2 contingency table: rows are (negative, positive), columns are
    # (word absent, word present). Rows are built as independent lists.
    I = [[0, 0], [0, 0]]
    I[0][0] = (cnt_n - cnt_x_n) * log((cnt_n - cnt_x_n) * total / ((total - cnt_x) * cnt_n)) / total
    I[0][1] = cnt_x_n * log(cnt_x_n * total / (cnt_x * cnt_n)) / total if cnt_x_n > 0 else 0
    I[1][0] = (cnt_p - cnt_x_p) * log((cnt_p - cnt_x_p) * total / ((total - cnt_x) * cnt_p)) / total
    I[1][1] = cnt_x_p * log(cnt_x_p * total / (cnt_x * cnt_p)) / total if cnt_x_p > 0 else 0
    return sum(map(sum, I))

def reduce_features(features, stream):
    return [word for word in negate_sequence(stream) if word in features]

def feature_selection_experiment(test_set):
    """
    Select top k features. Vary k from 1000 to 50000 and plot data
    """
    keys = set(positive.keys() + negative.keys())
    sorted_keys = sorted(keys, key=mutual_info, reverse=True) # Sort descending by mutual info
    features = set()
    num_features, accuracy = [], []
    print sorted_keys[-100:]
    for k in xrange(0, 50000, 1000):
        features |= set(sorted_keys[k:k+1000])
        preprocessor = partial(reduce_features, features)
        correct = 0
        for path, label in test_set:
            correct += classify(open(path).read(), preprocessor=preprocessor) == label
        num_features.append(k+1000)
        accuracy.append(correct / len(test_set))
    print negate_sequence("Is this a good idea")
    print reduce_features(features, "Is this a good idea")
    pylab.plot(num_features, accuracy)
    pylab.show()

def get_paths():
    """
    Returns supervised paths annotated with their actual labels.
    """
    posfiles = [("./aclImdb/test/pos/" + f, True) for f in os.listdir("./aclImdb/test/pos/")[:500]]
    negfiles = [("./aclImdb/test/neg/" + f, False) for f in os.listdir("./aclImdb/test/neg/")[:500]]
    return posfiles + negfiles

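# Running this module prints the mutual information of a few sample words and
# then runs the feature selection experiment over 500 positive and 500
# negative reviews from the aclImdb test split.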
if __name__ == '__main__':
    print mutual_info('good')
    print mutual_info('bad')
    print mutual_info('incredible')
    print mutual_info('jaskdhkasjdhkjincredible')
    feature_selection_experiment(get_paths())