gen_feature_data.py
import json
import extract_features
import preprocess
import config
from pprint import pprint
import os.path
import sys
import time

def load_json(filename):
    ''' Small helper to easily load json files '''
    with open(filename) as f:
        return json.load(f)

def getBills(votes):
    '''
    Look up the bill associated with each vote via the vote -> bill map.
    Returns:
        List of bills, one per vote, in the same order as the votes
    '''
    bills = []
    vote_bill_map = load_json('vote_bill_map')
    for vote in votes:
        vote_link = vote['vote']
        bill_id = vote_bill_map[vote_link]
        bill = load_json('bill_map/' + bill_id)
        bills.append(bill)
    return bills

def getData(rep_id, data_directory, preprocess_data=None):
    '''
    Builds the data set for one rep from the votes stored under <data_directory>/<rep_id>.
    Pass preprocess_data=None to regenerate the preprocessing information from the bills,
    or pass previously saved preprocessing data to reuse it and avoid the time sink.
    Returns:
        Data set for this rep in the form:
        { 'labels': A list of labels for each feature, 'data': List of data point objects }
    '''
    print 'Getting votes and bills...'
    votes = load_json(data_directory + '/' + rep_id)  # Get the votes for this rep
    bills = getBills(votes)
    print '\tdone.'
    # Generate information about the bills used for later feature generation
    if preprocess_data is None:
        preprocess_data = preprocess.preprocess(rep_id, bills)
    data_points = []
    all_labels = None
    print "Generating all feature vectors..."
    # Iterate through all the votes and bills and compile the final data set
    for i, v in enumerate(votes):
        if i % 20 == 0:
            # Crude progress indicator: print the percentage done, then back the
            # cursor up with '\b' so the next update overwrites it in place
            a_string = str(int(float(i) / float(len(votes)) * 100)) + '%'
            sys.stdout.write('\b%s%s' % (a_string, '\b' * len(a_string)))
        # Our final point object to feed into training
        point = {
            'option': v['option'],
            'bill': bills[i],
            'vote_obj': v
        }
        # Generate the input vector for this vote
        vector, labels = extract_features.generate_feature_vector(point['bill'], preprocess_data)
        point['vector'] = vector
        # If the labels for this vector differ from the previous ones,
        # the feature generation is broken or out of order
        if all_labels is None:
            all_labels = labels
        elif labels != all_labels:
            print "Error: Labels differ on data points. Feature vector generation is broken."
        # Ignore abstaining votes unless the config says to keep them
        if point['option'] == "+" or point['option'] == "-" or not config.remove_abstaining_votes:
            data_points.append(point)
    # Package the data and return
    result = {'labels': all_labels, 'data': data_points}
    return result
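
# A minimal usage sketch for the structure returned above (illustrative only;
# 'A000055' is a placeholder rep id, not taken from the data set):
#
#   train = getData('A000055', 'rep_votes_train')
#   X = [p['vector'] for p in train['data']]   # one feature vector per vote
#   y = [p['option'] for p in train['data']]   # the vote option ('+' / '-', plus abstentions if kept)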

def getTrainingData(rep_id):
    '''
    Get the training data set for this rep.
    Note: Passing preprocess_data=None causes the preprocessing data to be regenerated for this set.
    '''
    return getData(rep_id, 'rep_votes_train', preprocess_data=None)

def getValidationData(rep_id):
    '''
    Get the validation data set for this rep, reusing the saved preprocessing data.
    '''
    pre_data = load_json('preprocess_data/' + rep_id)
    return getData(rep_id, 'rep_votes_validation', preprocess_data=pre_data)

def getTestingData(rep_id):
    '''
    Get the test data set for this rep, reusing the saved preprocessing data.
    '''
    pre_data = load_json('preprocess_data/' + rep_id)
    return getData(rep_id, 'rep_votes_test', preprocess_data=pre_data)

# Generates experiment data for a single rep and stores it in
# data_set/<experiment_name>_(train/validation/test)/<rep_id>
def genExperimentData(rep_id, experiment_name, debug=2):
    if debug >= 2:
        print
        print "-------- " + rep_id + ' --------'
        print
        print ' ---- Generating Training Data ---- '
    # Load the cached vectors if they exist, otherwise generate and save them
    train_path = 'data_set/' + experiment_name + '_train/' + rep_id
    if os.path.exists(train_path) and not config.force_generate_features:
        if debug >= 1: print 'Pass. Using ' + train_path + ' training data.'
    else:
        # Get our training data
        train = getTrainingData(rep_id)
        # Create the vector directory to save into if necessary
        if not os.path.exists('data_set/' + experiment_name + '_train'):
            os.makedirs('data_set/' + experiment_name + '_train')
        with open(train_path, 'w') as f:
            f.write(json.dumps(train))
    if config.validate:
        if debug >= 2:
            print
            print ' ---- Generating Validation Data ---- '
        # Load the cached vectors if they exist, otherwise generate and save them
        validation_path = 'data_set/' + experiment_name + '_validation/' + rep_id
        if os.path.exists(validation_path) and not config.force_generate_features:
            if debug >= 1: print 'Pass. Using ' + validation_path + ' validation data.'
        else:
            # Get our validation data
            validation = getValidationData(rep_id)
            # Create the vector directory to save into if necessary
            if not os.path.exists('data_set/' + experiment_name + '_validation'):
                os.makedirs('data_set/' + experiment_name + '_validation')
            with open(validation_path, 'w') as f:
                f.write(json.dumps(validation))
    if debug >= 2:
        print
        print ' ---- Generating Testing Data ---- '
    # Load the cached vectors if they exist, otherwise generate and save them
    test_path = 'data_set/' + experiment_name + '_test/' + rep_id
    if os.path.exists(test_path) and not config.force_generate_features:
        if debug >= 1: print 'Pass. Using ' + test_path + ' testing data.'
    else:
        # Get our testing data
        test = getTestingData(rep_id)
        # Create the vector directory to save into if necessary
        if not os.path.exists('data_set/' + experiment_name + '_test'):
            os.makedirs('data_set/' + experiment_name + '_test')
        with open(test_path, 'w') as f:
            f.write(json.dumps(test))

# Generates experiment data for every rep listed in 'representatives' and stores it
# in data_set/<experiment_name>_(train/validation/test)/<rep_id>
def genAllExperimentData(experiment_name):
    personlist = load_json('representatives')
    for count, rep_id in enumerate(personlist):
        print
        print '====================================='
        print ' ' + rep_id + ' ' + str(count) + '/' + str(len(personlist))
        print '====================================='
        print
        genExperimentData(rep_id, experiment_name)
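
# A minimal command-line entry point, added as a sketch rather than part of the
# original module: it assumes the script is run from the directory that holds
# 'representatives', the 'rep_votes_*' directories and 'data_set/', and that the
# experiment name is given as the first argument.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        genAllExperimentData(sys.argv[1])
    else:
        print 'usage: python gen_feature_data.py <experiment_name>'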