biolab · lanzagar · Jan 26, 2017 · Jan 25, 2017 · Jan 25, 2017 · janezd
diff --git a/Orange/widgets/data/owdatasampler.py b/Orange/widgets/data/owdatasampler.py
@@ -46,6 +46,9 @@ class OWDataSampler(OWWidget):
     number_of_folds = Setting(10)
     selectedFold = Setting(1)
 
+    class Warning(OWWidget.Warning):
+        could_not_stratify = Msg("Stratification failed\n{}")  # {}: error text
+
     class Error(OWWidget.Error):
         too_many_folds = Msg("Number of folds exceeds data size")
         sample_larger_than_data = Msg("Sample must be smaller than data")
@@ -252,10 +255,17 @@ def updateindices(self):
             self.indices = None
             return
 
-        rnd = self.RandomSeed if self.use_seed else None
         stratified = (self.stratify and
                       type(self.data) == Table and
                       self.data.domain.has_discrete_class)
+        try:
+            self.indices = self.sample(data_length, size, stratified)
+        except ValueError as ex:
+            self.Warning.could_not_stratify(str(ex))
+            self.indices = self.sample(data_length, size, stratified=False)
+
+    def sample(self, data_length, size, stratified):
+        rnd = self.RandomSeed if self.use_seed else None
         if self.sampling_type == self.FixedSize:
             self.indice_gen = SampleRandomN(
                 size, stratified=stratified, replace=self.replacement,
@@ -268,7 +278,7 @@ def updateindices(self):
         else:
             self.indice_gen = SampleFoldIndices(
                 self.number_of_folds, stratified=stratified, random_state=rnd)
-        self.indices = self.indice_gen(self.data)
+        return self.indice_gen(self.data)
 
     def send_report(self):
         if self.sampling_type == self.FixedProportion:
@@ -375,7 +385,14 @@ def __init__(self, size=0, random_state=None):
         self.size = size
         self.random_state = random_state
 
-    def __call__(self):
+    def __call__(self, table=None):
+        """Bootstrap indices
+
+        Args:
+            table: Not used (but part of the signature)
+        Returns:
+            tuple (out_of_sample, sample) indices
+        """
         rgen = np.random.RandomState(self.random_state)
         sample = rgen.randint(0, self.size, self.size)
         sample.sort()  # not needed for the code below, just for the user

diff --git a/Orange/widgets/data/tests/test_owdatasampler.py b/Orange/widgets/data/tests/test_owdatasampler.py
@@ -12,7 +12,7 @@ def setUpClass(cls):
         cls.iris = Table("iris")
 
     def setUp(self):
-        self.widget = self.create_widget(OWDataSampler)
+        self.widget = self.create_widget(OWDataSampler)  # type: OWDataSampler
 
     def test_error_message(self):
         """ Check if error message appears and then disappears when
@@ -26,3 +26,36 @@ def test_error_message(self):
         self.assertFalse(self.widget.Error.too_many_folds.is_shown())
         self.send_signal("Data", Table(self.iris.domain))
         self.assertTrue(self.widget.Error.no_data.is_shown())
+
+    def test_stratified_on_unbalanced_data(self):
+        """Stratified sampling of the first 51 iris rows shows the warning."""
+        self.widget.controls.stratify.setChecked(True)
+        self.send_signal("Data", self.iris[:51])
+        warning = self.widget.Warning.could_not_stratify
+        self.assertTrue(warning.is_shown())
+
+    def test_bootstrap(self):
+        """Bootstrap sample and remaining data partition the input."""
+        self.select_sampling_type(self.widget.Bootstrap)
+        self.send_signal("Data", self.iris)
+
+        sample = self.get_output("Data Sample")
+        in_input = set(self.iris.ids)
+        in_sample = set(sample.ids)
+        in_remaining = set(self.get_output("Remaining Data").ids)
+
+        # Bootstrap should sample len(input) instances
+        self.assertEqual(len(sample), len(self.iris))
+        # Sample and remaining must together contain exactly the input
+        # instances (compared by id, not merely by count), and no instance
+        # may be present in both
+        self.assertEqual(in_sample | in_remaining, in_input)
+        self.assertTrue(in_sample.isdisjoint(in_remaining))
+        # The sample is never empty; remaining is empty only if all 150 draws
+        # are distinct (probability 150!/150**150 ~= 2e-64)
+        self.assertGreater(len(in_sample), 0)
+        self.assertGreater(len(in_remaining), 0)
+
+    def select_sampling_type(self, sampling_type):
+        group = self.widget.controls.sampling_type.group
+        group.buttons()[sampling_type].click()  # index into the button group