import numpy as np
import pandas as pd
import numbers
from sklearn.ensemble import RandomForestClassifier
from RuleListClassifier import RuleListClassifier


class BigDataRuleListClassifier(RuleListClassifier):
"""
A scikit-learn compatible wrapper for the Bayesian Rule List
classifier by Benjamin Letham, adapted to work on large datasets. It
trains a linear SVM first, takes the subset of the training data closest
to the decision boundary (specified by the parameter training_subset),
which is most critical to learning a classifier, and then uses this subset
to learn a rule list.
It produces a highly interpretable model (a list of decision rules) of
the same form as an expert system.
Parameters
----------
training_subset : float, optional (default=0.1)
Determines the fraction of the data to use for training the Bayesian
Rule List classifier (the data points closest to a linear decision
boundary are selected).
subset_estimator: BaseEstimator, optional (default=RandomForestClassifier)
An Estimator which is able to produce probabilities, used for finding
the subset of the data which is closest to the decision boundary
listlengthprior : int, optional (default=3)
Prior hyperparameter for expected list length (excluding null rule)
listwidthprior : int, optional (default=1)
Prior hyperparameter for expected list length (excluding null rule)
maxcardinality : int, optional (default=1)
Maximum cardinality of an itemset
minsupport : int, optional (default=10)
Minimum support (%) of an itemset
alpha : array_like, shape = [n_classes]
prior hyperparameter for multinomial pseudocounts
n_chains : int, optional (default=3)
Number of MCMC chains for inference
max_iter : int, optional (default=50000)
Maximum number of iterations
class1label: str, optional (default="class 1")
Label or description of what the positive class (with y=1) means
verbose: bool, optional (default=True)
Verbose output
"""
    def __init__(self, training_subset=0.1, subset_estimator=RandomForestClassifier(),
                 listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10,
                 alpha=np.array([1., 1.]), n_chains=3, max_iter=50000,
                 class1label="class 1", verbose=True):
        self.training_subset = training_subset
        self.subset_estimator = subset_estimator
        self.listlengthprior = listlengthprior
        self.listwidthprior = listwidthprior
        self.maxcardinality = maxcardinality
        self.minsupport = minsupport
        self.alpha = alpha
        self.n_chains = n_chains
        self.max_iter = max_iter
        self.class1label = class1label
        self.verbose = verbose
        self._zmin = 1
        self.thinning = 1  # the MCMC thinning rate (keep every sample)
        self.burnin = self.max_iter // 2  # number of MCMC samples to discard as burn-in
        self.discretizer = None
        self.d_star = None
    def _setdata(self, X, y, feature_labels=[], undiscretized_features=[]):
        self._setlabels(X, feature_labels)
        for fi in range(len(X[0])):
            if not isinstance(X[0][fi], numbers.Number):
                raise Exception("Sorry, only numeric data is supported by BigDataRuleListClassifier at this time")
        Xn = np.array(X)

        # train the subset estimator if it has not been fitted yet
        # (predict_proba raises if the estimator is unfitted; note the 2D
        # slice Xn[:1] rather than Xn[0], since scikit-learn rejects 1D input)
        try:
            self.subset_estimator.predict_proba(Xn[:1])
        except Exception:
            self.subset_estimator.fit(Xn, y)

        # distance of each point from the decision boundary: a predicted
        # probability of 0.5 means the point lies on the boundary
        dist = np.abs(0.5 - self.subset_estimator.predict_proba(Xn)[:, 1])
        ones_idx = np.where(y == 1)[0]
        zeros_idx = np.where(y == 0)[0]
        dist_ones = dist[ones_idx]
        dist_zeros = dist[zeros_idx]
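        # illustrative figures (not from the original source): a point with
        # predict_proba = 0.52 has dist = 0.02 and is kept early, while one
        # with predict_proba = 0.98 has dist = 0.48 and is dropped first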
        # keep the training_subset fraction of the data closest to the
        # decision boundary, preserving the class balance of the full dataset
        if self.verbose:
            print("Reduced from", len(X))
        n = int(len(y) * self.training_subset)
        # ascending argsort puts the smallest distances (the points
        # closest to the boundary) first
        bestidx_ones = np.argsort(dist_ones)
        bestidx_zeros = np.argsort(dist_zeros)
        one_fraction = len(ones_idx) / float(len(y))
        keep_idx = ones_idx[bestidx_ones[:int(n * one_fraction) + 1]]
        keep_idx = np.hstack((keep_idx, zeros_idx[bestidx_zeros[:int(n * (1 - one_fraction)) + 1]]))
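        # worked example (assumed figures, for illustration only): with 1000
        # points, training_subset=0.1 and 30% positives, n = 100 and
        # one_fraction = 0.3, so the 31 closest positives and the 71 closest
        # negatives are kept (the +1 guards against empty selections)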
        if isinstance(X, pd.DataFrame):
            X = X.iloc[keep_idx, :]
        else:
            X = np.array(X)[keep_idx, :]
        y = np.array(y)[keep_idx].astype(int)
        if self.verbose:
            print("...to", len(X), "data points")
        X = self._discretize_mixed_data(X, y, undiscretized_features)
        return X, y
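

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes that
    # fit/predict and feature_labels are inherited from RuleListClassifier,
    # and uses a synthetic dataset with hypothetical feature names purely
    # for illustration.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=2000, n_features=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # keep the 10% of the training data closest to the forest's boundary
    clf = BigDataRuleListClassifier(training_subset=0.1, max_iter=10000, verbose=False)
    clf.fit(X_train, y_train, feature_labels=["f%d" % i for i in range(5)])

    print(clf)  # prints the learned rule list
    print("accuracy:", np.mean(clf.predict(X_test) == y_test))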