Skip to content

Commit

Permalink
Merge pull request #1952 from astaric/stratified-when-possible
Browse files Browse the repository at this point in the history
[FIX] DataSampler: Fix crash when stratifying unbalanced datasets
(cherry picked from commit a8f71bc)

 Conflicts:
	Orange/widgets/data/owdatasampler.py
  • Loading branch information
lanzagar authored and astaric committed Feb 3, 2017
1 parent f315b17 commit a3d67c8
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
12 changes: 11 additions & 1 deletion Orange/widgets/data/owdatasampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class OWDataSampler(OWWidget):
number_of_folds = Setting(10)
selectedFold = Setting(1)

# Non-fatal messages shown by the widget.
# could_not_stratify: the "{}" placeholder receives the text of the
# ValueError caught in updateindices, which then retries the sampling
# with stratified=False.
class Warning(OWWidget.Warning):
could_not_stratify = Msg("Stratification failed\n{}")

class Error(OWWidget.Error):
too_many_folds = Msg("Number of folds exceeds data size")
sample_larger_than_data = Msg("Sample must be smaller than data")
Expand Down Expand Up @@ -251,10 +254,17 @@ def updateindices(self):
self.indices = None
return

rnd = self.RandomSeed if self.use_seed else None
stratified = (self.stratify and
type(self.data) == Table and
self.data.domain.has_discrete_class)
try:
self.sample(data_length, size, stratified)
except ValueError as ex:
self.Warning.could_not_stratify(str(ex))
self.sample(data_length, size, stratified=False)

def sample(self, data_length, size, stratified):
rnd = self.RandomSeed if self.use_seed else None
if self.sampling_type == self.FixedSize:
self.indices = sample_random_n(
self.data, size,
Expand Down
35 changes: 34 additions & 1 deletion Orange/widgets/data/tests/test_owdatasampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def setUpClass(cls):
cls.iris = Table("iris")

def setUp(self):
self.widget = self.create_widget(OWDataSampler)
self.widget = self.create_widget(OWDataSampler) # type: OWDataSampler

def test_error_message(self):
""" Check if error message appears and then disappears when
Expand All @@ -26,3 +26,36 @@ def test_error_message(self):
self.assertFalse(self.widget.Error.too_many_folds.is_shown())
self.send_signal("Data", Table(self.iris.domain))
self.assertTrue(self.widget.Error.no_data.is_shown())

def test_stratified_on_unbalanced_data(self):
    """Check that the could_not_stratify warning is shown for unbalanced input.

    Stratification is switched on and the first 51 rows of iris are sent
    to the widget; the widget is expected to show the warning.
    """
    # NOTE(review): presumably the first 51 iris rows hold 50 instances of
    # one class and a single instance of another -- confirm against the
    # row order of the bundled dataset.
    first_51_rows = self.iris[:51]
    widget = self.widget

    widget.controls.stratify.setChecked(True)
    self.send_signal("Data", first_51_rows)

    warning = widget.Warning.could_not_stratify
    self.assertTrue(warning.is_shown())

def test_bootstrap(self):
    """Check the size of a bootstrap sample and how ids split across outputs.

    The sample must have as many rows as the input, and the ids in
    "Data Sample" and "Remaining Data" must be disjoint and together
    cover every input id.
    """
    self.select_sampling_type(self.widget.Bootstrap)
    self.send_signal("Data", self.iris)

    all_ids = set(self.iris.ids)
    drawn = self.get_output("Data Sample")
    drawn_ids = set(drawn.ids)
    left_ids = set(self.get_output("Remaining Data").ids)

    # A bootstrap sample contains len(input) rows.
    self.assertEqual(len(drawn), len(self.iris))

    # The two outputs jointly cover all input ids ...
    covered = drawn_ids | left_ids
    self.assertEqual(len(covered), len(all_ids))
    # ... and no id is present in both of them.
    shared = drawn_ids & left_ids
    self.assertEqual(len(shared), 0)

    # Drawing with replacement always yields at least one distinct id in
    # the sample. The remaining output is empty only when every draw hits
    # a different row; for 150 rows that has probability
    # 150!/150**150 ~= 2e-64, so in practice it is never empty.
    self.assertGreater(len(drawn_ids), 0)
    self.assertGreater(len(left_ids), 0)

def select_sampling_type(self, sampling_type):
    """Click the button at index ``sampling_type`` in the sampling-type group."""
    button_group = self.widget.controls.sampling_type.group
    chosen = button_group.buttons()[sampling_type]
    chosen.click()

0 comments on commit a3d67c8

Please sign in to comment.