Merge pull request #3588 from matejklemen/preprocess-percentile

[ENH] Preprocess: implement Select Relevant Feature's percentile
biolab · Feb 16, 2019 · 9817fa8 · 9817fa8
2 parents 6d95125 + 4b2599c
commit 9817fa8
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 26 deletions.
diff --git a/Orange/preprocess/fss.py b/Orange/preprocess/fss.py
@@ -16,9 +16,10 @@
 class SelectBestFeatures(Reprable):
     """
     A feature selector that builds a new dataset consisting of either the top
-    `k` features or all those that exceed a given `threshold`. Features are
-    scored using the provided feature scoring `method`. By default it is
-    assumed that feature importance diminishes with decreasing scores.
+    `k` features (if `k` is an `int`) or a proportion (if `k` is a `float`
+    between 0.0 and 1.0), or all those that exceed a given `threshold`. Features
+    are scored using the provided feature scoring `method`. By default it is
+    assumed that feature importance decreases with decreasing scores.
 
     If both `k` and `threshold` are set, only features satisfying both
     conditions will be selected.
@@ -32,8 +33,8 @@ class SelectBestFeatures(Reprable):
     method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
         Univariate feature scoring method.
 
-    k : int
-        The number of top features to select.
+    k : int or float
+        The number or proportion of top features to select.
 
     threshold : float
         A threshold that a feature should meet according to the provided method.
@@ -50,6 +51,12 @@ def __init__(self, method=None, k=None, threshold=None, decreasing=True):
         self.decreasing = decreasing
 
     def __call__(self, data):
+        n_attrs = len(data.domain.attributes)
+        if isinstance(self.k, float):
+            effective_k = np.round(self.k * n_attrs).astype(int) or 1
+        else:
+            effective_k = self.k
+
         method = self.method
         # select default method according to the provided data
         if method is None:
@@ -73,7 +80,7 @@ def __call__(self, data):
         best = sorted(zip(scores, features), key=itemgetter(0),
                       reverse=self.decreasing)
         if self.k:
-            best = best[:self.k]
+            best = best[:effective_k]
         if self.threshold:
             pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else
                     (lambda x: x[0] <= self.threshold))
@@ -113,10 +120,13 @@ def __init__(self, k=0.1):
         self.k = k
 
     def __call__(self, data):
-        if type(self.k) == float:
-            self.k = int(len(data.domain.attributes) * self.k)
+        if isinstance(self.k, float):
+            effective_k = int(len(data.domain.attributes) * self.k)
+        else:
+            effective_k = self.k
+
         domain = Orange.data.Domain(
             random.sample(data.domain.attributes,
-                          min(self.k, len(data.domain.attributes))),
+                          min(effective_k, len(data.domain.attributes))),
             data.domain.class_vars, data.domain.metas)
         return data.transform(domain)
diff --git a/Orange/tests/test_fss.py b/Orange/tests/test_fss.py
@@ -27,6 +27,39 @@ def test_select_1(self):
         best = max((gini(self.titanic, f), f) for f in self.titanic.domain.attributes)[1]
         self.assertEqual(data2.domain.attributes[0], best)
 
+    def test_select_2(self):
+        gini = Gini()
+        # k=1.0 (100%) keeps all attributes; the best-scoring one comes first
+        sel1 = SelectBestFeatures(method=gini, k=1.0)
+        data2 = sel1(self.titanic)
+        best = max((gini(self.titanic, f), f) for f in self.titanic.domain.attributes)[1]
+        self.assertEqual(data2.domain.attributes[0], best)
+
+        # no k and no threshold, select all attributes
+        sel2 = SelectBestFeatures(method=gini, k=0)
+        data2 = sel2(self.titanic)
+        self.assertEqual(len(data2.domain.attributes), len(self.titanic.domain.attributes))
+
+        # 31% = selection of top 1 (out of 3) attributes
+        sel3 = SelectBestFeatures(method=gini, k=0.31)
+        data2 = sel3(self.titanic)
+        self.assertEqual(len(data2.domain.attributes), 1)
+
+        # 35% = selection of top 1 (out of 3) attributes
+        sel3 = SelectBestFeatures(method=gini, k=0.35)
+        data2 = sel3(self.titanic)
+        self.assertEqual(len(data2.domain.attributes), 1)
+
+        # 1% = select one (out of 3) attributes
+        sel3 = SelectBestFeatures(method=gini, k=0.01)
+        data2 = sel3(self.titanic)
+        self.assertEqual(len(data2.domain.attributes), 1)
+
+        # number of selected attrs should be relative to number of current input attrs
+        sel3 = SelectBestFeatures(method=gini, k=1.0)
+        data2 = sel3(self.wine)
+        self.assertEqual(len(data2.domain.attributes), 13)
+
     def test_select_threshold(self):
         anova = ANOVA()
         t = 30

diff --git a/Orange/widgets/data/owpreprocess.py b/Orange/widgets/data/owpreprocess.py
@@ -334,7 +334,7 @@ class UnivariateFeatureSelect(QWidget):
     edited = Signal()
 
     #: Strategy
-    Fixed, Percentile, FDR, FPR, FWE = 1, 2, 3, 4, 5
+    Fixed, Proportion, FDR, FPR, FWE = 1, 2, 3, 4, 5
 
     def __init__(self, parent=None, **kwargs):
         super().__init__(parent, **kwargs)
@@ -354,7 +354,7 @@ def __init__(self, parent=None, **kwargs):
 
         self.layout().addWidget(box)
 
-        box = QGroupBox(title="Strategy", flat=True)
+        box = QGroupBox(title="Number of features", flat=True)
         self.__group = group = QButtonGroup(self, exclusive=True)
         self.__spins = {}
 
@@ -370,20 +370,17 @@ def __init__(self, parent=None, **kwargs):
         self.__spins[UnivariateFeatureSelect.Fixed] = kspin
         form.addRow(fixedrb, kspin)
 
-        percrb = QRadioButton("Percentile:")
-        group.addButton(percrb, UnivariateFeatureSelect.Percentile)
+        percrb = QRadioButton("Proportion:")
+        group.addButton(percrb, UnivariateFeatureSelect.Proportion)
         pspin = QDoubleSpinBox(
-            minimum=0.0, maximum=100.0, singleStep=0.5,
+            minimum=1.0, maximum=100.0, singleStep=0.5,
             value=self.__p, suffix="%",
-            enabled=self.__strategy == UnivariateFeatureSelect.Percentile
+            enabled=self.__strategy == UnivariateFeatureSelect.Proportion
         )
 
         pspin.valueChanged[float].connect(self.setP)
         pspin.editingFinished.connect(self.edited)
-        self.__spins[UnivariateFeatureSelect.Percentile] = pspin
-        # Percentile controls disabled for now.
-        pspin.setEnabled(False)
-        percrb.setEnabled(False)
+        self.__spins[UnivariateFeatureSelect.Proportion] = pspin
         form.addRow(percrb, pspin)
 
 #         form.addRow(QRadioButton("FDR"), QDoubleSpinBox())
@@ -423,9 +420,9 @@ def setK(self, k):
     def setP(self, p):
         if self.__p != p:
             self.__p = p
-            spin = self.__spins[UnivariateFeatureSelect.Percentile]
+            spin = self.__spins[UnivariateFeatureSelect.Proportion]
             spin.setValue(p)
-            if self.__strategy == UnivariateFeatureSelect.Percentile:
+            if self.__strategy == UnivariateFeatureSelect.Proportion:
                 self.changed.emit()
 
     def setItems(self, itemlist):
@@ -508,10 +505,12 @@ def createinstance(params):
         score = FeatureSelectEditor.MEASURES[score][1]
         strategy = params.get("strategy", UnivariateFeatureSelect.Fixed)
         k = params.get("k", 10)
+        p = params.get("p", 75.0)
         if strategy == UnivariateFeatureSelect.Fixed:
-            return preprocess.fss.SelectBestFeatures(score, k=k)
+            return preprocess.fss.SelectBestFeatures(score(), k=k)
+        elif strategy == UnivariateFeatureSelect.Proportion:
+            return preprocess.fss.SelectBestFeatures(score(), k=p / 100)
         else:
-            # TODO: implement top percentile selection
             raise NotImplementedError
 
     def __repr__(self):
@@ -534,7 +533,7 @@ def __init__(self, parent=None, **kwargs):
         self.__k = 10
         self.__p = 75.0
 
-        box = QGroupBox(title="Strategy", flat=True)
+        box = QGroupBox(title="Number of features", flat=True)
         self.__group = group = QButtonGroup(self, exclusive=True)
         self.__spins = {}
 

diff --git a/Orange/widgets/data/tests/test_owpreprocess.py b/Orange/widgets/data/tests/test_owpreprocess.py
@@ -8,7 +8,8 @@
 )
 from Orange.preprocess import discretize, impute, fss, score
 from Orange.widgets.data import owpreprocess
-from Orange.widgets.data.owpreprocess import OWPreprocess
+from Orange.widgets.data.owpreprocess import OWPreprocess, \
+    UnivariateFeatureSelect
 from Orange.widgets.tests.base import WidgetTest, datasets
 
 
@@ -49,6 +50,27 @@ def test_normalize(self):
         np.testing.assert_allclose(output.X.mean(0), 0, atol=1e-7)
         np.testing.assert_allclose(output.X.std(0), 1, atol=1e-7)
 
+    def test_select_features(self):
+        data = Table("iris")
+        saved = {"preprocessors": [("orange.preprocess.fss",
+                                    {"strategy": UnivariateFeatureSelect.Fixed,
+                                     "k": 2})]}
+        model = self.widget.load(saved)
+        self.widget.set_model(model)
+        self.send_signal(self.widget.Inputs.data, data)
+        output = self.get_output(self.widget.Outputs.preprocessed_data)
+        self.assertEqual(len(output.domain.attributes), 2)
+
+        saved = {"preprocessors": [
+            ("orange.preprocess.fss",
+             {"strategy": UnivariateFeatureSelect.Proportion,
+              "p": 75})]}
+        model = self.widget.load(saved)
+        self.widget.set_model(model)
+        self.send_signal(self.widget.Inputs.data, data)
+        output = self.get_output(self.widget.Outputs.preprocessed_data)
+        self.assertEqual(len(output.domain.attributes), 3)
+
     def test_data_column_nans(self):
         """
         ZeroDivisionError - Weights sum to zero, can't be normalized
@@ -126,7 +148,7 @@ def test_editor(self):
         widget = owpreprocess.FeatureSelectEditor()
         p = widget.createinstance(widget.parameters())
         self.assertIsInstance(p, fss.SelectBestFeatures)
-        self.assertEqual(p.method, score.InfoGain)
+        self.assertIsInstance(p.method, score.InfoGain)
         self.assertEqual(p.k, 10)