From 8cce3420678f81ae7e2b0b2e31e29f4b30ff2a21 Mon Sep 17 00:00:00 2001 From: Jernej Urankar Date: Tue, 28 Mar 2017 14:38:32 +0200 Subject: [PATCH 1/3] [FIX] Continuize: prevent crashing: column with equal values Crashing prevented when dealing with equal values and choosing option normalize by standard deviation https://sentry.io/biolab/orange3/issues/242520198/ --- Orange/widgets/data/owcontinuize.py | 30 +++++++++++-------- .../widgets/data/tests/test_owcontinuize.py | 14 +++++++++ 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/Orange/widgets/data/owcontinuize.py b/Orange/widgets/data/owcontinuize.py index b0c299e5e17..d0b019a1d40 100644 --- a/Orange/widgets/data/owcontinuize.py +++ b/Orange/widgets/data/owcontinuize.py @@ -1,3 +1,7 @@ +from functools import reduce + +import numpy as np + from AnyQt import QtWidgets from AnyQt.QtCore import Qt @@ -5,6 +9,8 @@ from Orange.util import Reprable from Orange.statistics import distribution from Orange.preprocess import Continuize, Normalize +from Orange.preprocess.transformation import \ + Identity, Indicator, Indicator1, Normalizer from Orange.data.table import Table from Orange.widgets import gui, widget from Orange.widgets.settings import Setting @@ -138,12 +144,6 @@ def send_report(self): ("Value range", self.value_ranges[self.zero_based])]) -from Orange.preprocess.transformation import \ - Identity, Indicator, Indicator1, Normalizer - -from functools import reduce - - class WeightedIndicator(Indicator): def __init__(self, variable, value, weight=1.0): super().__init__(variable, value) @@ -156,7 +156,7 @@ def transform(self, c): return t -class WeightedIndicator_1(Indicator1): +class WeightedIndicator1(Indicator1): def __init__(self, variable, value, weight=1.0): super().__init__(variable, value) self.weight = weight @@ -176,7 +176,7 @@ def make_indicator_var(source, value_ind, weight=None, zero_based=True): elif weight is None: indicator = Indicator1(source, value=value_ind) else: - indicator = WeightedIndicator_1(source, value=value_ind, weight=weight) + indicator = WeightedIndicator1(source, value=value_ind, weight=weight) return Orange.data.ContinuousVariable( "{}={}".format(source.name, source.values[value_ind]), compute_value=indicator @@ -279,7 +279,7 @@ def continuize_var(var, elif multinomial_treatment == Continuize.AsOrdinal: return [ordinal_to_continuous(var)] elif multinomial_treatment == Continuize.AsNormalizedOrdinal: - return [ordinal_to_normalized_continuous(var, zero_based)] + return [ordinal_to_norm_continuous(var, zero_based)] elif multinomial_treatment == Continuize.Indicators: return one_hot_coding(var, zero_based) elif multinomial_treatment == Continuize.FirstAsBase or \ @@ -320,7 +320,7 @@ def ordinal_to_continuous(var): compute_value=Identity(var)) -def ordinal_to_normalized_continuous(var, zero_based=True): +def ordinal_to_norm_continuous(var, zero_based=True): n_values = len(var.values) if zero_based: return normalized_var(var, 0, 1 / (n_values - 1)) @@ -330,8 +330,11 @@ def ordinal_to_normalized_continuous(var, zero_based=True): def normalize_by_span(var, data_or_dist, zero_based=True): dist = _ensure_dist(var, data_or_dist) - v_max, v_min = dist.max(), dist.min() - span = v_max - v_min + if dist.shape[1] > 0: + v_max, v_min = dist.max(), dist.min() + else: + v_max, v_min = 0, 0 + span = (v_max - v_min) if span < 1e-15: span = 1 @@ -344,6 +347,7 @@ def normalize_by_span(var, data_or_dist, zero_based=True): def normalize_by_sd(var, data_or_dist): dist = _ensure_dist(var, data_or_dist) mean, sd = dist.mean(), dist.standard_deviation() + sd = sd if sd > 1e-10 else 1 return normalized_var(var, mean, 1 / sd) @@ -365,7 +369,7 @@ def __call__(self, data): domain = data.domain if (treat == Continuize.ReportError and - any(var.is_discrete and len(var.values) > 2 for var in domain)): + any(var.is_discrete and len(var.values) > 2 for var in domain)): raise ValueError("Domain has multinomial attributes") newdomain = continuize_domain( diff --git a/Orange/widgets/data/tests/test_owcontinuize.py b/Orange/widgets/data/tests/test_owcontinuize.py index 9b5ea604ced..04e291ece1e 100644 --- a/Orange/widgets/data/tests/test_owcontinuize.py +++ b/Orange/widgets/data/tests/test_owcontinuize.py @@ -33,3 +33,17 @@ def test_empty_data(self): widget.unconditional_commit() imp_data = self.get_output("Data") self.assertIsNone(imp_data) + + def test_one_column_equal_values(self): + """ + No crash on a column with equal values and with selected option + normalize by standard deviation. + GH-2144 + """ + table = Table("iris") + table = table[:, 1] + table[:] = 42.0 + self.send_signal("Data", table) + # Normalize.NormalizeBySD + self.widget.continuous_treatment = 2 + self.widget.unconditional_commit() From 26d191df7e85ed9f3a50dd0675ebb8237b186771 Mon Sep 17 00:00:00 2001 From: Jernej Urankar Date: Tue, 28 Mar 2017 14:40:54 +0200 Subject: [PATCH 2/3] [FIX] Continuize: prevent crashing: column with NaN values Crashing prevented when dealing with NaN values and choosing option normalize by standard deviation https://sentry.io/biolab/orange3/issues/227119466/ --- Orange/widgets/data/owcontinuize.py | 5 ++++- Orange/widgets/data/tests/test_owcontinuize.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Orange/widgets/data/owcontinuize.py b/Orange/widgets/data/owcontinuize.py index d0b019a1d40..b5c3e50c7b5 100644 --- a/Orange/widgets/data/owcontinuize.py +++ b/Orange/widgets/data/owcontinuize.py @@ -346,7 +346,10 @@ def normalize_by_span(var, data_or_dist, zero_based=True): def normalize_by_sd(var, data_or_dist): dist = _ensure_dist(var, data_or_dist) - mean, sd = dist.mean(), dist.standard_deviation() + if dist.shape[1] > 0: + mean, sd = dist.mean(), dist.standard_deviation() + else: + mean, sd = 0, 1 sd = sd if sd > 1e-10 else 1 return normalized_var(var, mean, 1 / sd) diff --git a/Orange/widgets/data/tests/test_owcontinuize.py b/Orange/widgets/data/tests/test_owcontinuize.py index 04e291ece1e..651638b3650 100644 --- a/Orange/widgets/data/tests/test_owcontinuize.py +++ b/Orange/widgets/data/tests/test_owcontinuize.py @@ -47,3 +47,21 @@ def test_one_column_equal_values(self): # Normalize.NormalizeBySD self.widget.continuous_treatment = 2 self.widget.unconditional_commit() + + def test_one_column_nan_values_normalize_sd(self): + """ + No crash on a column with NaN values and with selected option + normalize by standard deviation (Not the same issue which is + tested above). + GH-2144 + """ + table = Table("iris") + table[:, 2] = np.NaN + self.send_signal("Data", table) + # Normalize.NormalizeBySD + self.widget.continuous_treatment = 2 + self.widget.unconditional_commit() + table = Table("iris") + table[1, 2] = np.NaN + self.send_signal("Data", table) + self.widget.unconditional_commit() From e8d303f0a208887eb4d5d3c7609376dcbfd085a7 Mon Sep 17 00:00:00 2001 From: Jernej Urankar Date: Tue, 28 Mar 2017 14:43:16 +0200 Subject: [PATCH 3/3] [FIX] Continuize: prevent crashing: NaN and norm. by span Crashing prevented when dealing with NaN values and choosing option normalize by span https://sentry.io/biolab/orange3/issues/242519777/ --- Orange/widgets/data/tests/test_owcontinuize.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Orange/widgets/data/tests/test_owcontinuize.py b/Orange/widgets/data/tests/test_owcontinuize.py index 651638b3650..105439775a4 100644 --- a/Orange/widgets/data/tests/test_owcontinuize.py +++ b/Orange/widgets/data/tests/test_owcontinuize.py @@ -65,3 +65,21 @@ def test_one_column_nan_values_normalize_sd(self): table[1, 2] = np.NaN self.send_signal("Data", table) self.widget.unconditional_commit() + + + def test_one_column_nan_values_normalize_span(self): + """ + No crash on a column with NaN values and with selected option + normalize by span. + GH-2144 + """ + table = Table("iris") + table[:, 2] = np.NaN + self.send_signal("Data", table) + # Normalize.NormalizeBySpan + self.widget.continuous_treatment = 1 + self.widget.unconditional_commit() + table = Table("iris") + table[1, 2] = np.NaN + self.send_signal("Data", table) + self.widget.unconditional_commit()