From 9e7e5b2228051dbfc60b82962e707a05bbc7d1df Mon Sep 17 00:00:00 2001 From: Andreja Kovacic Date: Mon, 7 Oct 2019 11:28:36 +0200 Subject: [PATCH 1/5] Add Bhattacharyya --- Orange/distance/__init__.py | 2 +- Orange/distance/distance.py | 16 ++++++++++++++++ Orange/widgets/unsupervised/owdistances.py | 3 ++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Orange/distance/__init__.py b/Orange/distance/__init__.py index 77dbe871211..3eb37b4f58a 100644 --- a/Orange/distance/__init__.py +++ b/Orange/distance/__init__.py @@ -1,7 +1,7 @@ from .distance import (Distance, DistanceModel, Euclidean, Manhattan, Cosine, Jaccard, SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute, - Mahalanobis, MahalanobisDistance, Hamming) + Mahalanobis, MahalanobisDistance, Hamming, Bhattacharyya) from .base import ( _preprocess, remove_discrete_features, remove_nonbinary_features, impute) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index 87d91ce45dd..f1066bf79f7 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -3,6 +3,7 @@ import numpy as np from scipy import stats +from scipy.spatial.distance import cdist from scipy import sparse as sp import sklearn.metrics as skl_metrics from sklearn.utils.extmath import row_norms, safe_sparse_dot @@ -644,6 +645,21 @@ class PearsonRAbsolute(CorrelationDistance): def fit(self, _): return PearsonModel(True, self.axis, self.impute) +def _bhattacharyya(a, b): + # not a real metric, does not obey triangle inequality + return -np.log(np.sum(np.sqrt(a*b))) + +class Bhattacharyya(Distance): + def fit(self, data): + return BhattacharyyaModel(self.axis, self.impute) + +class BhattacharyyaModel(DistanceModel): + + def compute_distances(self, x1, x2): + if x2 is None: + x2 = x1 + return cdist(x1, x2, _bhattacharyya) + class Mahalanobis(Distance): supports_sparse = False diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index 
faddbdb5495..49191a614f9 100644 --- a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -21,7 +21,8 @@ ("Absolute Spearman", distance.SpearmanRAbsolute), ("Pearson", distance.PearsonR), ("Absolute Pearson", distance.PearsonRAbsolute), - ("Hamming", distance.Hamming) + ("Hamming2", distance.Hamming), + ('Bhattacharyya', distance.Bhattacharyya) ] From cb6c69813c3968f64e4b3ef2af19760630e526bf Mon Sep 17 00:00:00 2001 From: Andreja Kovacic Date: Tue, 15 Oct 2019 13:49:31 +0200 Subject: [PATCH 2/5] Add Bhattacharyya distance docs --- .../source/widgets/unsupervised/distances.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/visual-programming/source/widgets/unsupervised/distances.md b/doc/visual-programming/source/widgets/unsupervised/distances.md index 75c17238ce4..f66976b87df 100644 --- a/doc/visual-programming/source/widgets/unsupervised/distances.md +++ b/doc/visual-programming/source/widgets/unsupervised/distances.md @@ -32,8 +32,9 @@ Distances work well with Orange add-ons, too. The distance matrix can be fed to - [Pearson](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the values, remapped as a distance in a [0, 1] interval) - [Pearson absolute](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the absolute values, remapped as a distance in a [0, 1] interval) - [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) (the number of features at which the corresponding values are different) + - [Bhattacharyya distance](https://en.wikipedia.org/wiki/Bhattacharyya_distance) (similarity between two probability distributions; not a true distance metric, as it does not obey the triangle inequality) - Normalize the features. Normalization is always done column-wise. + Normalize the features. Normalization is always done column-wise. Values are zero-centered and scaled. 
In case of missing values, the widget automatically imputes the average value of the row or the column. The widget works for both numeric and categorical data. In case of categorical data, the distance is 0 if the two values are the same ('green' and 'green') and 1 if they are not ('green' and 'blue'). 3. Tick *Apply Automatically* to automatically commit changes to other widgets. Alternatively, press '*Apply*'. From 93b0494635cad717187e278707e9a873232a8e8b Mon Sep 17 00:00:00 2001 From: Andreja Kovacic Date: Fri, 18 Oct 2019 10:42:20 +0200 Subject: [PATCH 3/5] Add Bhattacharyya test --- Orange/distance/distance.py | 20 +++++- Orange/tests/test_distances.py | 66 ++++++++++++++----- Orange/widgets/unsupervised/owdistances.py | 7 +- .../unsupervised/tests/test_owdistances.py | 9 +++ 4 files changed, 80 insertions(+), 22 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index f1066bf79f7..e7cb4f6e7c7 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -3,7 +3,6 @@ import numpy as np from scipy import stats -from scipy.spatial.distance import cdist from scipy import sparse as sp import sklearn.metrics as skl_metrics from sklearn.utils.extmath import row_norms, safe_sparse_dot @@ -645,20 +644,35 @@ class PearsonRAbsolute(CorrelationDistance): def fit(self, _): return PearsonModel(True, self.axis, self.impute) +def _prob_dist(a): + # Makes the vector sum to one, as to mimick probability distribution. 
+ return a / np.sum(a) + def _bhattacharyya(a, b): # not a real metric, does not obey triangle inequality - return -np.log(np.sum(np.sqrt(a*b))) + a = _prob_dist(a) + b = _prob_dist(b) + if sp.issparse(a): + return - np.log(np.sum(np.sqrt(a.multiply(b)))) + return - np.log(np.sum(np.sqrt(a * b))) class Bhattacharyya(Distance): + supports_discrete = False + supports_sparse = True + def fit(self, data): return BhattacharyyaModel(self.axis, self.impute) + class BhattacharyyaModel(DistanceModel): def compute_distances(self, x1, x2): if x2 is None: x2 = x1 - return cdist(x1, x2, _bhattacharyya) + if self.axis == 1: + return pairwise_distances(x1, x2, _bhattacharyya) + else: + return pairwise_distances(x1.T, x2.T, _bhattacharyya) class Mahalanobis(Distance): diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py index 907b84a1950..f165d72515b 100644 --- a/Orange/tests/test_distances.py +++ b/Orange/tests/test_distances.py @@ -2,6 +2,7 @@ # pylint: disable=missing-docstring from unittest import TestCase +import unittest import pickle import numpy as np @@ -14,7 +15,8 @@ DiscreteVariable, StringVariable, Instance) from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute, Manhattan, Cosine, - Jaccard, _preprocess, MahalanobisDistance) + Jaccard, _preprocess, MahalanobisDistance, + Bhattacharyya) from Orange.distance.distance import _spearmanr2, _corrcoef2 from Orange.misc import DistMatrix from Orange.tests import named_file, test_filename @@ -91,20 +93,20 @@ def test_from_file(self): self.assertEqual(m.axis, 1) with named_file( - """3 axis=1 symmetric - 0.12 3.45 6.78 - 9.01 2.34 5.67 - 8.90""") as name: + """3 axis=1 symmetric + 0.12 3.45 6.78 + 9.01 2.34 5.67 + 8.90""") as name: m = DistMatrix.from_file(name) np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90], [9.01, 2.34, 0], [8.90, 0, 0]])) with named_file( - """3 row_labels - starič 0.12 3.45 6.78 - aleš 9.01 2.34 5.67 - anže 8.90""", 
encoding="utf-8""") as name: + """3 row_labels + starič 0.12 3.45 6.78 + aleš 9.01 2.34 5.67 + anže 8.90""", encoding="utf-8""") as name: m = DistMatrix.from_file(name) np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90], [9.01, 2.34, 0], @@ -150,10 +152,10 @@ def assertErrorMsg(content, msg): def test_save(self): with named_file( - """3 axis=1 row_labels - danny 0.12 3.45 6.78 - eve 9.01 2.34 5.67 - frank 8.90""") as name: + """3 axis=1 row_labels + danny 0.12 3.45 6.78 + eve 9.01 2.34 5.67 + frank 8.90""") as name: m = DistMatrix.from_file(name) m.save(name) m = DistMatrix.from_file(name) @@ -167,11 +169,11 @@ def test_save(self): self.assertEqual(m.axis, 1) with named_file( - """3 axis=0 asymmetric col_labels row_labels - ann bert chad - danny 0.12 3.45 6.78 - eve 9.01 2.34 5.67 - frank 8.90 1.23 4.56""") as name: + """3 axis=0 asymmetric col_labels row_labels + ann bert chad + danny 0.12 3.45 6.78 + eve 9.01 2.34 5.67 + frank 8.90 1.23 4.56""") as name: m = DistMatrix.from_file(name) m.save(name) m = DistMatrix.from_file(name) @@ -943,6 +945,31 @@ def test_dimensions(self): mah(xt[0], xt[1]) +class TestBhattacharyya(TestCase): + + @classmethod + def setUpClass(cls): + cls.dist = Bhattacharyya + + def test_dense_array(self): + #Also checks normalization + data = Table('iris') + true_out = np.array([[0, 4.48049499e-04, 2.07117086e-05], + [4.48049499e-04, 0, 3.65052724e-04], + [2.07117086e-05, 3.65052724e-04, 0]]) + np.testing.assert_array_almost_equal(self.dist(data.X[:3]), true_out) + + def test_sparse_array(self): + data = csr_matrix([[0.5, 0.5], [0, 0.5]]) + self.assertAlmostEqual(self.dist(data[0], data[1]), 0.3465735902799726, delta=1e-5) + + def test_columns(self): + data = np.array([[0.5, 0.2], [0.5, 0.8]]) + true_out = np.array([[0, 0.05268025782891318], + [0.05268025782891318, 0]]) + np.testing.assert_array_almost_equal(self.dist(data, axis=0), true_out) + + class TestDistances(TestCase): @classmethod def setUpClass(cls): @@ -982,3 +1009,6 @@ 
def test_distance_to_instance(self): iris = Table('iris') inst = Instance(iris.domain, np.concatenate((iris[1].x, iris[1].y))) self.assertEqual(Euclidean(iris[1], inst), 0) + +if __name__ == '__main__': + unittest.main() diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index 49191a614f9..9ac036a5a08 100644 --- a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -1,5 +1,6 @@ from AnyQt.QtCore import Qt from scipy.sparse import issparse +from numpy import min as _min import bottleneck as bn import Orange.data @@ -21,7 +22,7 @@ ("Absolute Spearman", distance.SpearmanRAbsolute), ("Pearson", distance.PearsonR), ("Absolute Pearson", distance.PearsonRAbsolute), - ("Hamming2", distance.Hamming), + ("Hamming", distance.Hamming), ('Bhattacharyya', distance.Bhattacharyya) ] @@ -58,6 +59,7 @@ class Error(OWWidget.Error): dense_metric_sparse_data = Msg("{} requires dense data.") distances_memory_error = Msg("Not enough memory") distances_value_error = Msg("Problem in calculation:\n{}") + negative_value_error = Msg("Only non-negative values alowed for Bhattcharyya.") class Warning(OWWidget.Warning): ignoring_discrete = Msg("Ignoring categorical features") @@ -158,6 +160,9 @@ def _fix_missing(): _fix_discrete, _fix_missing, _fix_nonbinary): if not check(): return None + if (METRICS[self.metric_idx][0] == 'Bhattacharyya') and _min(data.X) < 0: + self.Error.negative_value_error() + return None try: if metric.supports_normalization and self.normalized_dist: return metric(data, axis=1 - self.axis, impute=True, diff --git a/Orange/widgets/unsupervised/tests/test_owdistances.py b/Orange/widgets/unsupervised/tests/test_owdistances.py index 245552ade26..0dd8c2a7a70 100644 --- a/Orange/widgets/unsupervised/tests/test_owdistances.py +++ b/Orange/widgets/unsupervised/tests/test_owdistances.py @@ -103,3 +103,12 @@ def test_too_big_array(self): def test_migrates_normalized_dist(self): w = 
self.create_widget(OWDistances, stored_settings={"metric_idx": 0}) self.assertFalse(w.normalized_dist) + + def test_negative_values_bhattacharyya(self): + self.iris.X[0, 0] *= -1 + for self.widget.metric_idx, (name, _) in enumerate(METRICS): + if name == "Bhattacharyya": + break + self.send_signal(self.widget.Inputs.data, self.iris) + self.assertTrue(self.widget.Error.negative_value_error.is_shown()) + self.iris.X[0, 0] *= -1 From 3e6b5495d2f6e373a16c5ae0e44839c6a0fd789d Mon Sep 17 00:00:00 2001 From: Andreja Kovacic Date: Fri, 25 Oct 2019 14:03:27 +0200 Subject: [PATCH 4/5] Move input validation to distance method --- Orange/distance/distance.py | 22 ++++++++++++++++--- Orange/tests/test_distances.py | 12 +++++++++- Orange/widgets/unsupervised/owdistances.py | 5 ----- .../unsupervised/tests/test_owdistances.py | 6 ++--- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index e7cb4f6e7c7..1be1880692c 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -5,6 +5,7 @@ from scipy import stats from scipy import sparse as sp import sklearn.metrics as skl_metrics +from sklearn.utils import check_array from sklearn.utils.extmath import row_norms, safe_sparse_dot from sklearn.metrics import pairwise_distances @@ -645,16 +646,31 @@ def fit(self, _): return PearsonModel(True, self.axis, self.impute) def _prob_dist(a): - # Makes the vector sum to one, as to mimick probability distribution. + # Makes the vector sum to one, as to mimic probability distribution. 
return a / np.sum(a) +def non_negative(a): + #Raise an exception for infinities, nans and negative values + try: + check_array(a, accept_sparse=True, accept_large_sparse=True, ensure_2d=False) + except: + raise ValueError("Bhattcharyya distance requires non-negative values") + if sp.issparse(a): + if a.min() < 0: + raise ValueError("Bhattcharyya distance requires non-negative values") + return + if min(a) < 0: + raise ValueError("Bhattcharyya distance requires non-negative values") + def _bhattacharyya(a, b): # not a real metric, does not obey triangle inequality + non_negative(a) + non_negative(b) a = _prob_dist(a) b = _prob_dist(b) if sp.issparse(a): - return - np.log(np.sum(np.sqrt(a.multiply(b)))) - return - np.log(np.sum(np.sqrt(a * b))) + return -np.log(np.sum(np.sqrt(a.multiply(b)))) + return -np.log(np.sum(np.sqrt(a * b))) class Bhattacharyya(Distance): supports_discrete = False diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py index f165d72515b..9ab7a999a80 100644 --- a/Orange/tests/test_distances.py +++ b/Orange/tests/test_distances.py @@ -962,13 +962,23 @@ def test_dense_array(self): def test_sparse_array(self): data = csr_matrix([[0.5, 0.5], [0, 0.5]]) self.assertAlmostEqual(self.dist(data[0], data[1]), 0.3465735902799726, delta=1e-5) - + def test_columns(self): data = np.array([[0.5, 0.2], [0.5, 0.8]]) true_out = np.array([[0, 0.05268025782891318], [0.05268025782891318, 0]]) np.testing.assert_array_almost_equal(self.dist(data, axis=0), true_out) + def test_negative_input(self): + a = np.array([0, np.nan]) + b = np.array([1, 1]) + self.assertRaises(ValueError, self.dist, a, b) + a[1] = -1 + self.assertRaises(ValueError, self.dist, a, b) + a = csr_matrix(a) + b = csr_matrix(b) + self.assertRaises(ValueError, self.dist, a, b) + class TestDistances(TestCase): @classmethod diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py index 9ac036a5a08..153cd78b56f 100644 --- 
a/Orange/widgets/unsupervised/owdistances.py +++ b/Orange/widgets/unsupervised/owdistances.py @@ -1,6 +1,5 @@ from AnyQt.QtCore import Qt from scipy.sparse import issparse -from numpy import min as _min import bottleneck as bn import Orange.data @@ -59,7 +58,6 @@ class Error(OWWidget.Error): dense_metric_sparse_data = Msg("{} requires dense data.") distances_memory_error = Msg("Not enough memory") distances_value_error = Msg("Problem in calculation:\n{}") - negative_value_error = Msg("Only non-negative values alowed for Bhattcharyya.") class Warning(OWWidget.Warning): ignoring_discrete = Msg("Ignoring categorical features") @@ -160,9 +158,6 @@ def _fix_missing(): _fix_discrete, _fix_missing, _fix_nonbinary): if not check(): return None - if (METRICS[self.metric_idx][0] == 'Bhattacharyya') and _min(data.X) < 0: - self.Error.negative_value_error() - return None try: if metric.supports_normalization and self.normalized_dist: return metric(data, axis=1 - self.axis, impute=True, diff --git a/Orange/widgets/unsupervised/tests/test_owdistances.py b/Orange/widgets/unsupervised/tests/test_owdistances.py index 0dd8c2a7a70..2362a6bbadc 100644 --- a/Orange/widgets/unsupervised/tests/test_owdistances.py +++ b/Orange/widgets/unsupervised/tests/test_owdistances.py @@ -106,9 +106,9 @@ def test_migrates_normalized_dist(self): def test_negative_values_bhattacharyya(self): self.iris.X[0, 0] *= -1 - for self.widget.metric_idx, (name, _) in enumerate(METRICS): - if name == "Bhattacharyya": + for self.widget.metric_idx, (_, metric) in enumerate(METRICS): + if metric == distance.Bhattacharyya: break self.send_signal(self.widget.Inputs.data, self.iris) - self.assertTrue(self.widget.Error.negative_value_error.is_shown()) + self.assertTrue(self.widget.Error.distances_value_error.is_shown()) self.iris.X[0, 0] *= -1 From 69083d11a1316ae8dcae1a5ba6de52db43e842a9 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 1 Nov 2019 19:47:11 +0100 Subject: [PATCH 5/5] distance.check_non_negative: Simplify 
--- Orange/distance/distance.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index 1be1880692c..e8156e5ce38 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -645,33 +645,31 @@ class PearsonRAbsolute(CorrelationDistance): def fit(self, _): return PearsonModel(True, self.axis, self.impute) + def _prob_dist(a): # Makes the vector sum to one, as to mimic probability distribution. return a / np.sum(a) -def non_negative(a): - #Raise an exception for infinities, nans and negative values - try: - check_array(a, accept_sparse=True, accept_large_sparse=True, ensure_2d=False) - except: - raise ValueError("Bhattcharyya distance requires non-negative values") - if sp.issparse(a): - if a.min() < 0: - raise ValueError("Bhattcharyya distance requires non-negative values") - return - if min(a) < 0: + +def check_non_negative(a): + # Raise an exception for infinities, nans and negative values + check_array(a, + accept_sparse=True, accept_large_sparse=True, ensure_2d=False) + if a.min() < 0: raise ValueError("Bhattcharyya distance requires non-negative values") + def _bhattacharyya(a, b): # not a real metric, does not obey triangle inequality - non_negative(a) - non_negative(b) + check_non_negative(a) + check_non_negative(b) a = _prob_dist(a) b = _prob_dist(b) if sp.issparse(a): return -np.log(np.sum(np.sqrt(a.multiply(b)))) return -np.log(np.sum(np.sqrt(a * b))) + class Bhattacharyya(Distance): supports_discrete = False supports_sparse = True