diff --git a/Orange/preprocess/fss.py b/Orange/preprocess/fss.py index 0603874b085..e6e3620cfa3 100644 --- a/Orange/preprocess/fss.py +++ b/Orange/preprocess/fss.py @@ -16,9 +16,10 @@ class SelectBestFeatures(Reprable): """ A feature selector that builds a new dataset consisting of either the top - `k` features or all those that exceed a given `threshold`. Features are - scored using the provided feature scoring `method`. By default it is - assumed that feature importance diminishes with decreasing scores. + `k` features (if `k` is an `int`) or a proportion (if `k` is a `float` + between 0.0 and 1.0), or all those that exceed a given `threshold`. Features + are scored using the provided feature scoring `method`. By default it is + assumed that feature importance decreases with decreasing scores. If both `k` and `threshold` are set, only features satisfying both conditions will be selected. @@ -32,8 +33,8 @@ class SelectBestFeatures(Reprable): method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer Univariate feature scoring method. - k : int - The number of top features to select. + k : int or float + The number or proportion of top features to select. threshold : float A threshold that a feature should meet according to the provided method. 
@@ -50,6 +51,12 @@ def __init__(self, method=None, k=None, threshold=None, decreasing=True): self.decreasing = decreasing def __call__(self, data): + n_attrs = len(data.domain.attributes) + if isinstance(self.k, float): + effective_k = np.round(self.k * n_attrs).astype(int) or 1 + else: + effective_k = self.k + method = self.method # select default method according to the provided data if method is None: @@ -73,7 +80,7 @@ def __call__(self, data): best = sorted(zip(scores, features), key=itemgetter(0), reverse=self.decreasing) if self.k: - best = best[:self.k] + best = best[:effective_k] if self.threshold: pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else (lambda x: x[0] <= self.threshold)) @@ -113,10 +120,13 @@ def __init__(self, k=0.1): self.k = k def __call__(self, data): - if type(self.k) == float: - self.k = int(len(data.domain.attributes) * self.k) + if isinstance(self.k, float): + effective_k = int(len(data.domain.attributes) * self.k) + else: + effective_k = self.k + domain = Orange.data.Domain( random.sample(data.domain.attributes, - min(self.k, len(data.domain.attributes))), + min(effective_k, len(data.domain.attributes))), data.domain.class_vars, data.domain.metas) return data.transform(domain) diff --git a/Orange/tests/test_fss.py b/Orange/tests/test_fss.py index 9fcb00fd61c..649f43c015e 100644 --- a/Orange/tests/test_fss.py +++ b/Orange/tests/test_fss.py @@ -27,6 +27,39 @@ def test_select_1(self): best = max((gini(self.titanic, f), f) for f in self.titanic.domain.attributes)[1] self.assertEqual(data2.domain.attributes[0], best) + def test_select_2(self): + gini = Gini() + # k=1.0 (100%) keeps all attributes; the best-scoring one comes first + sel1 = SelectBestFeatures(method=gini, k=1.0) + data2 = sel1(self.titanic) + best = max((gini(self.titanic, f), f) for f in self.titanic.domain.attributes)[1] + self.assertEqual(data2.domain.attributes[0], best) + + # no k and no threshold, select all attributes + sel2 = SelectBestFeatures(method=gini, k=0) + data2 = 
sel2(self.titanic) + self.assertEqual(len(data2.domain.attributes), len(self.titanic.domain.attributes)) + + # 31% = selection of top 1 (out of 3) attributes + sel3 = SelectBestFeatures(method=gini, k=0.31) + data2 = sel3(self.titanic) + self.assertEqual(len(data2.domain.attributes), 1) + + # 35% = selection of top 1 (out of 3) attributes + sel3 = SelectBestFeatures(method=gini, k=0.35) + data2 = sel3(self.titanic) + self.assertEqual(len(data2.domain.attributes), 1) + + # 1% = select one (out of 3) attributes + sel3 = SelectBestFeatures(method=gini, k=0.01) + data2 = sel3(self.titanic) + self.assertEqual(len(data2.domain.attributes), 1) + + # number of selected attrs should be relative to number of current input attrs + sel3 = SelectBestFeatures(method=gini, k=1.0) + data2 = sel3(self.wine) + self.assertEqual(len(data2.domain.attributes), 13) + def test_select_threshold(self): anova = ANOVA() t = 30 diff --git a/Orange/widgets/data/owpreprocess.py b/Orange/widgets/data/owpreprocess.py index c51acc15060..361ec50b299 100644 --- a/Orange/widgets/data/owpreprocess.py +++ b/Orange/widgets/data/owpreprocess.py @@ -334,7 +334,7 @@ class UnivariateFeatureSelect(QWidget): edited = Signal() #: Strategy - Fixed, Percentile, FDR, FPR, FWE = 1, 2, 3, 4, 5 + Fixed, Proportion, FDR, FPR, FWE = 1, 2, 3, 4, 5 def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) @@ -354,7 +354,7 @@ def __init__(self, parent=None, **kwargs): self.layout().addWidget(box) - box = QGroupBox(title="Strategy", flat=True) + box = QGroupBox(title="Number of features", flat=True) self.__group = group = QButtonGroup(self, exclusive=True) self.__spins = {} @@ -370,20 +370,17 @@ def __init__(self, parent=None, **kwargs): self.__spins[UnivariateFeatureSelect.Fixed] = kspin form.addRow(fixedrb, kspin) - percrb = QRadioButton("Proportion:") + group.addButton(percrb, 
UnivariateFeatureSelect.Proportion) pspin = QDoubleSpinBox( - minimum=0.0, maximum=100.0, singleStep=0.5, + minimum=1.0, maximum=100.0, singleStep=0.5, value=self.__p, suffix="%", - enabled=self.__strategy == UnivariateFeatureSelect.Percentile + enabled=self.__strategy == UnivariateFeatureSelect.Proportion ) pspin.valueChanged[float].connect(self.setP) pspin.editingFinished.connect(self.edited) - self.__spins[UnivariateFeatureSelect.Percentile] = pspin - # Percentile controls disabled for now. - pspin.setEnabled(False) - percrb.setEnabled(False) + self.__spins[UnivariateFeatureSelect.Proportion] = pspin form.addRow(percrb, pspin) # form.addRow(QRadioButton("FDR"), QDoubleSpinBox()) @@ -423,9 +420,9 @@ def setK(self, k): def setP(self, p): if self.__p != p: self.__p = p - spin = self.__spins[UnivariateFeatureSelect.Percentile] + spin = self.__spins[UnivariateFeatureSelect.Proportion] spin.setValue(p) - if self.__strategy == UnivariateFeatureSelect.Percentile: + if self.__strategy == UnivariateFeatureSelect.Proportion: self.changed.emit() def setItems(self, itemlist): @@ -508,10 +505,12 @@ def createinstance(params): score = FeatureSelectEditor.MEASURES[score][1] strategy = params.get("strategy", UnivariateFeatureSelect.Fixed) k = params.get("k", 10) + p = params.get("p", 75.0) if strategy == UnivariateFeatureSelect.Fixed: - return preprocess.fss.SelectBestFeatures(score, k=k) + return preprocess.fss.SelectBestFeatures(score(), k=k) + elif strategy == UnivariateFeatureSelect.Proportion: + return preprocess.fss.SelectBestFeatures(score(), k=p / 100) else: - # TODO: implement top percentile selection raise NotImplementedError def __repr__(self): @@ -534,7 +533,7 @@ def __init__(self, parent=None, **kwargs): self.__k = 10 self.__p = 75.0 - box = QGroupBox(title="Strategy", flat=True) + box = QGroupBox(title="Number of features", flat=True) self.__group = group = QButtonGroup(self, exclusive=True) self.__spins = {} diff --git 
a/Orange/widgets/data/tests/test_owpreprocess.py b/Orange/widgets/data/tests/test_owpreprocess.py index 317013626f5..f8c4ac2036a 100644 --- a/Orange/widgets/data/tests/test_owpreprocess.py +++ b/Orange/widgets/data/tests/test_owpreprocess.py @@ -8,7 +8,8 @@ ) from Orange.preprocess import discretize, impute, fss, score from Orange.widgets.data import owpreprocess -from Orange.widgets.data.owpreprocess import OWPreprocess +from Orange.widgets.data.owpreprocess import OWPreprocess, \ + UnivariateFeatureSelect from Orange.widgets.tests.base import WidgetTest, datasets @@ -49,6 +50,27 @@ def test_normalize(self): np.testing.assert_allclose(output.X.mean(0), 0, atol=1e-7) np.testing.assert_allclose(output.X.std(0), 1, atol=1e-7) + def test_select_features(self): + data = Table("iris") + saved = {"preprocessors": [("orange.preprocess.fss", + {"strategy": UnivariateFeatureSelect.Fixed, + "k": 2})]} + model = self.widget.load(saved) + self.widget.set_model(model) + self.send_signal(self.widget.Inputs.data, data) + output = self.get_output(self.widget.Outputs.preprocessed_data) + self.assertEqual(len(output.domain.attributes), 2) + + saved = {"preprocessors": [ + ("orange.preprocess.fss", + {"strategy": UnivariateFeatureSelect.Proportion, + "p": 75})]} + model = self.widget.load(saved) + self.widget.set_model(model) + self.send_signal(self.widget.Inputs.data, data) + output = self.get_output(self.widget.Outputs.preprocessed_data) + self.assertEqual(len(output.domain.attributes), 3) + def test_data_column_nans(self): """ ZeroDivisonError - Weights sum to zero, can't be normalized @@ -126,7 +148,7 @@ def test_editor(self): widget = owpreprocess.FeatureSelectEditor() p = widget.createinstance(widget.parameters()) self.assertIsInstance(p, fss.SelectBestFeatures) - self.assertEqual(p.method, score.InfoGain) + self.assertIsInstance(p.method, score.InfoGain) self.assertEqual(p.k, 10)