Skip to content

Commit

Permalink
Merge pull request #3588 from matejklemen/preprocess-percentile
Browse files Browse the repository at this point in the history
[ENH] Preprocess: implement Select Relevant Feature's percentile
  • Loading branch information
janezd authored Feb 16, 2019
2 parents 6d95125 + 4b2599c commit 9817fa8
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 26 deletions.
28 changes: 19 additions & 9 deletions Orange/preprocess/fss.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
class SelectBestFeatures(Reprable):
"""
A feature selector that builds a new dataset consisting of either the top
`k` features or all those that exceed a given `threshold`. Features are
scored using the provided feature scoring `method`. By default it is
assumed that feature importance diminishes with decreasing scores.
`k` features (if `k` is an `int`) or a proportion (if `k` is a `float`
between 0.0 and 1.0), or all those that exceed a given `threshold`. Features
are scored using the provided feature scoring `method`. By default it is
assumed that feature importance decreases with decreasing scores.
If both `k` and `threshold` are set, only features satisfying both
conditions will be selected.
Expand All @@ -32,8 +33,8 @@ class SelectBestFeatures(Reprable):
method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
Univariate feature scoring method.
k : int
The number of top features to select.
k : int or float
The number or proportion of top features to select.
threshold : float
A threshold that a feature should meet according to the provided method.
Expand All @@ -50,6 +51,12 @@ def __init__(self, method=None, k=None, threshold=None, decreasing=True):
self.decreasing = decreasing

def __call__(self, data):
n_attrs = len(data.domain.attributes)
if isinstance(self.k, float):
effective_k = np.round(self.k * n_attrs).astype(int) or 1
else:
effective_k = self.k

method = self.method
# select default method according to the provided data
if method is None:
Expand All @@ -73,7 +80,7 @@ def __call__(self, data):
best = sorted(zip(scores, features), key=itemgetter(0),
reverse=self.decreasing)
if self.k:
best = best[:self.k]
best = best[:effective_k]
if self.threshold:
pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else
(lambda x: x[0] <= self.threshold))
Expand Down Expand Up @@ -113,10 +120,13 @@ def __init__(self, k=0.1):
self.k = k

def __call__(self, data):
    """
    Return `data` transformed to a domain with randomly sampled attributes.

    If `self.k` is a `float`, it is interpreted as a proportion of the
    attributes in `data` and truncated to an integer count; otherwise it
    is used as the count itself. The count is capped at the number of
    attributes available in `data`.

    `self.k` is never modified, so a proportion stays a proportion when
    the same selector is later applied to data with a different number
    of attributes.

    Class variables and metas of the input domain are kept as they are.
    """
    attributes = data.domain.attributes
    n_attrs = len(attributes)
    if isinstance(self.k, float):
        effective_k = int(n_attrs * self.k)
    else:
        effective_k = self.k

    domain = Orange.data.Domain(
        random.sample(attributes, min(effective_k, n_attrs)),
        data.domain.class_vars, data.domain.metas)
    return data.transform(domain)
33 changes: 33 additions & 0 deletions Orange/tests/test_fss.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,39 @@ def test_select_1(self):
best = max((gini(self.titanic, f), f) for f in self.titanic.domain.attributes)[1]
self.assertEqual(data2.domain.attributes[0], best)

def test_select_2(self):
    """Selection with `k` given as a proportion (float) or as zero."""
    scorer = Gini()
    titanic = self.titanic
    titanic_attrs = titanic.domain.attributes

    # k=1.0 (100%): the output is ordered by score, so its first
    # attribute has to be the best-scored one
    selected = SelectBestFeatures(method=scorer, k=1.0)(titanic)
    top_attr = max((scorer(titanic, attr), attr) for attr in titanic_attrs)[1]
    self.assertEqual(selected.domain.attributes[0], top_attr)

    # k=0 with no threshold: nothing is cut, all attributes are kept
    selected = SelectBestFeatures(method=scorer, k=0)(titanic)
    self.assertEqual(len(selected.domain.attributes), len(titanic_attrs))

    # 31%, 35% and 1% of titanic's attributes (expected to be 3 here)
    # must each yield exactly one attribute; a proportion that rounds
    # down to zero still selects one
    for proportion in (0.31, 0.35, 0.01):
        selected = SelectBestFeatures(method=scorer, k=proportion)(titanic)
        self.assertEqual(len(selected.domain.attributes), 1)

    # the count is derived from the data passed in at call time:
    # 100% of wine is expected to give 13 attributes
    selected = SelectBestFeatures(method=scorer, k=1.0)(self.wine)
    self.assertEqual(len(selected.domain.attributes), 13)

def test_select_threshold(self):
anova = ANOVA()
t = 30
Expand Down
29 changes: 14 additions & 15 deletions Orange/widgets/data/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ class UnivariateFeatureSelect(QWidget):
edited = Signal()

#: Strategy
Fixed, Percentile, FDR, FPR, FWE = 1, 2, 3, 4, 5
Fixed, Proportion, FDR, FPR, FWE = 1, 2, 3, 4, 5

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
Expand All @@ -354,7 +354,7 @@ def __init__(self, parent=None, **kwargs):

self.layout().addWidget(box)

box = QGroupBox(title="Strategy", flat=True)
box = QGroupBox(title="Number of features", flat=True)
self.__group = group = QButtonGroup(self, exclusive=True)
self.__spins = {}

Expand All @@ -370,20 +370,17 @@ def __init__(self, parent=None, **kwargs):
self.__spins[UnivariateFeatureSelect.Fixed] = kspin
form.addRow(fixedrb, kspin)

percrb = QRadioButton("Percentile:")
group.addButton(percrb, UnivariateFeatureSelect.Percentile)
percrb = QRadioButton("Proportion:")
group.addButton(percrb, UnivariateFeatureSelect.Proportion)
pspin = QDoubleSpinBox(
minimum=0.0, maximum=100.0, singleStep=0.5,
minimum=1.0, maximum=100.0, singleStep=0.5,
value=self.__p, suffix="%",
enabled=self.__strategy == UnivariateFeatureSelect.Percentile
enabled=self.__strategy == UnivariateFeatureSelect.Proportion
)

pspin.valueChanged[float].connect(self.setP)
pspin.editingFinished.connect(self.edited)
self.__spins[UnivariateFeatureSelect.Percentile] = pspin
# Percentile controls disabled for now.
pspin.setEnabled(False)
percrb.setEnabled(False)
self.__spins[UnivariateFeatureSelect.Proportion] = pspin
form.addRow(percrb, pspin)

# form.addRow(QRadioButton("FDR"), QDoubleSpinBox())
Expand Down Expand Up @@ -423,9 +420,9 @@ def setK(self, k):
def setP(self, p):
if self.__p != p:
self.__p = p
spin = self.__spins[UnivariateFeatureSelect.Percentile]
spin = self.__spins[UnivariateFeatureSelect.Proportion]
spin.setValue(p)
if self.__strategy == UnivariateFeatureSelect.Percentile:
if self.__strategy == UnivariateFeatureSelect.Proportion:
self.changed.emit()

def setItems(self, itemlist):
Expand Down Expand Up @@ -508,10 +505,12 @@ def createinstance(params):
score = FeatureSelectEditor.MEASURES[score][1]
strategy = params.get("strategy", UnivariateFeatureSelect.Fixed)
k = params.get("k", 10)
p = params.get("p", 75.0)
if strategy == UnivariateFeatureSelect.Fixed:
return preprocess.fss.SelectBestFeatures(score, k=k)
return preprocess.fss.SelectBestFeatures(score(), k=k)
elif strategy == UnivariateFeatureSelect.Proportion:
return preprocess.fss.SelectBestFeatures(score(), k=p / 100)
else:
# TODO: implement top percentile selection
raise NotImplementedError

def __repr__(self):
Expand All @@ -534,7 +533,7 @@ def __init__(self, parent=None, **kwargs):
self.__k = 10
self.__p = 75.0

box = QGroupBox(title="Strategy", flat=True)
box = QGroupBox(title="Number of features", flat=True)
self.__group = group = QButtonGroup(self, exclusive=True)
self.__spins = {}

Expand Down
26 changes: 24 additions & 2 deletions Orange/widgets/data/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
)
from Orange.preprocess import discretize, impute, fss, score
from Orange.widgets.data import owpreprocess
from Orange.widgets.data.owpreprocess import OWPreprocess
from Orange.widgets.data.owpreprocess import OWPreprocess, \
UnivariateFeatureSelect
from Orange.widgets.tests.base import WidgetTest, datasets


Expand Down Expand Up @@ -49,6 +50,27 @@ def test_normalize(self):
np.testing.assert_allclose(output.X.mean(0), 0, atol=1e-7)
np.testing.assert_allclose(output.X.std(0), 1, atol=1e-7)

def test_select_features(self):
    """Fixed and Proportion strategies select the expected number of
    features when loaded from saved preprocessor settings."""
    iris = Table("iris")
    # (saved fss parameters, expected number of output attributes);
    # the Proportion case expects 75% of iris' attributes to be 3
    cases = (
        ({"strategy": UnivariateFeatureSelect.Fixed, "k": 2}, 2),
        ({"strategy": UnivariateFeatureSelect.Proportion, "p": 75}, 3),
    )
    for params, expected in cases:
        settings = {"preprocessors": [("orange.preprocess.fss", params)]}
        self.widget.set_model(self.widget.load(settings))
        self.send_signal(self.widget.Inputs.data, iris)
        result = self.get_output(self.widget.Outputs.preprocessed_data)
        self.assertEqual(len(result.domain.attributes), expected)

def test_data_column_nans(self):
"""
ZeroDivisionError - Weights sum to zero, can't be normalized
Expand Down Expand Up @@ -126,7 +148,7 @@ def test_editor(self):
widget = owpreprocess.FeatureSelectEditor()
p = widget.createinstance(widget.parameters())
self.assertIsInstance(p, fss.SelectBestFeatures)
self.assertEqual(p.method, score.InfoGain)
self.assertIsInstance(p.method, score.InfoGain)
self.assertEqual(p.k, 10)


Expand Down

0 comments on commit 9817fa8

Please sign in to comment.