Merge pull request #4111 from AndrejaKovacic/bhatthacharayya

[ENH] Bhattacharyya distance
biolab · Nov 1, 2019 · 0c0a1e9 · 0c0a1e9
2 parents 0ab648b + 69083d1
commit 0c0a1e9
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 21 deletions.
diff --git a/Orange/distance/__init__.py b/Orange/distance/__init__.py
@@ -1,7 +1,7 @@
 from .distance import (Distance, DistanceModel,
                        Euclidean, Manhattan, Cosine, Jaccard,
                        SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
-                       Mahalanobis, MahalanobisDistance, Hamming)
+                       Mahalanobis, MahalanobisDistance, Hamming, Bhattacharyya)
 
 from .base import (
     _preprocess, remove_discrete_features, remove_nonbinary_features, impute)
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
@@ -5,6 +5,7 @@
 from scipy import stats
 from scipy import sparse as sp
 import sklearn.metrics as skl_metrics
+from sklearn.utils import check_array
 from sklearn.utils.extmath import row_norms, safe_sparse_dot
 from sklearn.metrics import pairwise_distances
 
@@ -645,6 +646,49 @@ def fit(self, _):
         return PearsonModel(True, self.axis, self.impute)
 
 
+def _prob_dist(a):
+    # Divides `a` by the sum of all its entries so that the result sums to one, mimicking a probability distribution.
+    return a / np.sum(a)  # NOTE(review): no guard for a zero sum -- confirm callers never pass all-zero input
+
+
+def check_non_negative(a):
+    # Raise an exception for infinities, NaNs and negative values: check_array validates the input, the min() test below rejects negatives.
+    check_array(a,  # sparse input is accepted and the input need not be 2-D
+                accept_sparse=True, accept_large_sparse=True, ensure_2d=False)
+    if a.min() < 0:
+        raise ValueError("Bhattcharyya distance requires non-negative values")  # NOTE(review): "Bhattcharyya" is misspelled in this message
+
+
+def _bhattacharyya(a, b):
+    # Returns -log(sum(sqrt(a * b))) after validating both inputs and scaling each to sum to one; not a real metric, does not obey triangle inequality
+    check_non_negative(a)
+    check_non_negative(b)
+    a = _prob_dist(a)
+    b = _prob_dist(b)
+    if sp.issparse(a):  # only `a` is tested for sparsity; the sparse path multiplies element-wise via .multiply()
+        return -np.log(np.sum(np.sqrt(a.multiply(b))))
+    return -np.log(np.sum(np.sqrt(a * b)))
+
+
+class Bhattacharyya(Distance):
+    supports_discrete = False  # presumably consulted by the Distance base class -- confirm
+    supports_sparse = True  # _bhattacharyya has a dedicated branch for sparse input
+
+    def fit(self, data):  # `data` is not used: the model is built from self.axis and self.impute only
+        return BhattacharyyaModel(self.axis, self.impute)
+
+
+class BhattacharyyaModel(DistanceModel):
+
+    def compute_distances(self, x1, x2):  # returns the pairwise_distances result for x1 vs x2 using _bhattacharyya
+        if x2 is None:  # no second operand given: compare x1 with itself
+            x2 = x1
+        if self.axis == 1:  # inputs are passed to pairwise_distances as they are
+            return pairwise_distances(x1, x2, _bhattacharyya)
+        else:  # any other axis value: both inputs are transposed first
+            return pairwise_distances(x1.T, x2.T, _bhattacharyya)
+
+
 class Mahalanobis(Distance):
     supports_sparse = False
     supports_missing = False

diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
@@ -2,6 +2,7 @@
 # pylint: disable=missing-docstring
 
 from unittest import TestCase
+import unittest
 import pickle
 
 import numpy as np
@@ -14,7 +15,8 @@
                          DiscreteVariable, StringVariable, Instance)
 from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
                              PearsonR, PearsonRAbsolute, Manhattan, Cosine,
-                             Jaccard, _preprocess, MahalanobisDistance)
+                             Jaccard, _preprocess, MahalanobisDistance, 
+                             Bhattacharyya)
 from Orange.distance.distance import _spearmanr2, _corrcoef2
 from Orange.misc import DistMatrix
 from Orange.tests import named_file, test_filename
@@ -91,20 +93,20 @@ def test_from_file(self):
             self.assertEqual(m.axis, 1)
 
         with named_file(
-            """3 axis=1 symmetric
-                0.12	3.45	6.78
-                9.01	2.34	5.67
-                8.90""") as name:
+                """3 axis=1 symmetric
+                    0.12	3.45	6.78
+                    9.01	2.34	5.67
+                    8.90""") as name:
             m = DistMatrix.from_file(name)
         np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
                                                     [9.01, 2.34, 0],
                                                     [8.90, 0, 0]]))
 
         with named_file(
-            """3 row_labels
-                starič	0.12	3.45	6.78
-                aleš	9.01	2.34	5.67
-                anže	8.90""", encoding="utf-8""") as name:
+                """3 row_labels
+                    starič	0.12	3.45	6.78
+                    aleš	9.01	2.34	5.67
+                    anže	8.90""", encoding="utf-8""") as name:
             m = DistMatrix.from_file(name)
             np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
                                                         [9.01, 2.34, 0],
@@ -150,10 +152,10 @@ def assertErrorMsg(content, msg):
 
     def test_save(self):
         with named_file(
-            """3 axis=1 row_labels
-                danny	0.12	3.45	6.78
-                eve 	9.01	2.34	5.67
-                frank	8.90""") as name:
+                """3 axis=1 row_labels
+                    danny	0.12	3.45	6.78
+                    eve 	9.01	2.34	5.67
+                    frank	8.90""") as name:
             m = DistMatrix.from_file(name)
             m.save(name)
             m = DistMatrix.from_file(name)
@@ -167,11 +169,11 @@ def test_save(self):
             self.assertEqual(m.axis, 1)
 
         with named_file(
-            """3 axis=0 asymmetric col_labels row_labels
-                         ann	bert	chad
-                danny	0.12	3.45	6.78
-                  eve	9.01	2.34	5.67
-                frank	8.90	1.23	4.56""") as name:
+                """3 axis=0 asymmetric col_labels row_labels
+                             ann	bert	chad
+                    danny	0.12	3.45	6.78
+                      eve	9.01	2.34	5.67
+                    frank	8.90	1.23	4.56""") as name:
             m = DistMatrix.from_file(name)
             m.save(name)
             m = DistMatrix.from_file(name)
@@ -943,6 +945,41 @@ def test_dimensions(self):
         mah(xt[0], xt[1])
 
 
+class TestBhattacharyya(TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.dist = Bhattacharyya  # the class itself is stored; tests call it directly as self.dist(...)
+
+    def test_dense_array(self):
+        # Also checks normalization (the first three iris rows are passed in without prior scaling)
+        data = Table('iris')
+        true_out = np.array([[0, 4.48049499e-04, 2.07117086e-05],
+                             [4.48049499e-04, 0, 3.65052724e-04],
+                             [2.07117086e-05, 3.65052724e-04, 0]])
+        np.testing.assert_array_almost_equal(self.dist(data.X[:3]), true_out)
+
+    def test_sparse_array(self):
+        data = csr_matrix([[0.5, 0.5], [0, 0.5]])  # two sparse rows, compared with each other below
+        self.assertAlmostEqual(self.dist(data[0], data[1]), 0.3465735902799726, delta=1e-5)
+
+    def test_columns(self):
+        data = np.array([[0.5, 0.2], [0.5, 0.8]])  # each column already sums to one
+        true_out = np.array([[0, 0.05268025782891318],
+                             [0.05268025782891318, 0]])
+        np.testing.assert_array_almost_equal(self.dist(data, axis=0), true_out)
+
+    def test_negative_input(self):
+        a = np.array([0, np.nan])  # a NaN entry must be rejected
+        b = np.array([1, 1])
+        self.assertRaises(ValueError, self.dist, a, b)
+        a[1] = -1  # a negative entry must be rejected as well
+        self.assertRaises(ValueError, self.dist, a, b)
+        a = csr_matrix(a)  # repeat the negative-value check with sparse inputs
+        b = csr_matrix(b)
+        self.assertRaises(ValueError, self.dist, a, b)
+
+
 class TestDistances(TestCase):
     @classmethod
     def setUpClass(cls):
@@ -982,3 +1019,6 @@ def test_distance_to_instance(self):
         iris = Table('iris')
         inst = Instance(iris.domain, np.concatenate((iris[1].x, iris[1].y)))
         self.assertEqual(Euclidean(iris[1], inst), 0)
+
+if __name__ == '__main__':  # run the test suite when this file is executed as a script
+    unittest.main()
diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
@@ -21,7 +21,8 @@
     ("Absolute Spearman", distance.SpearmanRAbsolute),
     ("Pearson", distance.PearsonR),
     ("Absolute Pearson", distance.PearsonRAbsolute),
-    ("Hamming", distance.Hamming)
+    ("Hamming", distance.Hamming),
+    ('Bhattacharyya', distance.Bhattacharyya)
 ]
 
 

diff --git a/Orange/widgets/unsupervised/tests/test_owdistances.py b/Orange/widgets/unsupervised/tests/test_owdistances.py
@@ -103,3 +103,12 @@ def test_too_big_array(self):
     def test_migrates_normalized_dist(self):
         w = self.create_widget(OWDistances, stored_settings={"metric_idx": 0})
         self.assertFalse(w.normalized_dist)
+
+    def test_negative_values_bhattacharyya(self):
+        self.iris.X[0, 0] *= -1  # flip the sign of one value in the shared table
+        for self.widget.metric_idx, (_, metric) in enumerate(METRICS):  # the loop target itself assigns widget.metric_idx
+            if metric == distance.Bhattacharyya:
+                break  # metric_idx is left at the Bhattacharyya entry
+        self.send_signal(self.widget.Inputs.data, self.iris)
+        self.assertTrue(self.widget.Error.distances_value_error.is_shown())
+        self.iris.X[0, 0] *= -1  # flip the sign back so the table is left as it was
diff --git a/doc/visual-programming/source/widgets/unsupervised/distances.md b/doc/visual-programming/source/widgets/unsupervised/distances.md
@@ -32,8 +32,9 @@ Distances work well with Orange add-ons, too. The distance matrix can be fed to
    - [Pearson](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the values, remapped as a distance in a [0, 1] interval)
    - [Pearson absolute](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the absolute values, remapped as a distance in a [0, 1] interval)
    - [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) (the number of features at which the corresponding values are different)
+   - [Bhattacharyya distance](https://en.wikipedia.org/wiki/Bhattacharyya_distance) (a measure of similarity between two probability distributions; not a true distance, as it does not obey the triangle inequality)
 
-   Normalize the features. Normalization is always done column-wise.
+   Normalize the features. Normalization is always done column-wise. Values are zero-centered and scaled.
    In case of missing values, the widget automatically imputes the average value of the row or the column.
    The widget works for both numeric and categorical data. In case of categorical data, the distance is 0 if the two values are the same ('green' and 'green') and 1 if they are not ('green' and 'blue').
 3. Tick *Apply Automatically* to automatically commit changes to other widgets. Alternatively, press '*Apply*'.