Skip to content

Commit

Permalink
Merge pull request #4111 from AndrejaKovacic/bhatthacharayya
Browse files Browse the repository at this point in the history
[ENH] Bhatthacharayya distance
  • Loading branch information
janezd authored Nov 1, 2019
2 parents 0ab648b + 69083d1 commit 0c0a1e9
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Orange/distance/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .distance import (Distance, DistanceModel,
Euclidean, Manhattan, Cosine, Jaccard,
SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute,
Mahalanobis, MahalanobisDistance, Hamming)
Mahalanobis, MahalanobisDistance, Hamming, Bhattacharyya)

from .base import (
_preprocess, remove_discrete_features, remove_nonbinary_features, impute)
44 changes: 44 additions & 0 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils import check_array
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances

Expand Down Expand Up @@ -645,6 +646,49 @@ def fit(self, _):
return PearsonModel(True, self.axis, self.impute)


def _prob_dist(a):
    """Scale `a` by the sum of its entries so that the result sums to one.

    The scaled values mimic a probability distribution.
    """
    total = np.sum(a)
    return a / total


def check_non_negative(a):
    """Validate that `a` is usable as input for the Bhattacharyya distance.

    The array (dense or sparse, not necessarily two-dimensional) is first
    passed through scikit-learn's `check_array`, which raises an exception
    for infinities and NaNs, and is then checked for negative values.

    Raises
    ------
    ValueError
        If the smallest value in `a` is negative.
    """
    # check_array is called only for its validation side effect; the
    # (possibly converted) array it returns is deliberately discarded.
    check_array(
        a, accept_sparse=True, accept_large_sparse=True, ensure_2d=False)
    if a.min() < 0:
        raise ValueError(
            "Bhattacharyya distance requires non-negative values")


def _bhattacharyya(a, b):
    """Return the Bhattacharyya distance between vectors `a` and `b`.

    Both inputs are validated with `check_non_negative` and rescaled with
    `_prob_dist`; the result is the negative logarithm of the summed square
    roots of their element-wise product.

    Note: this is not a real metric, as it does not obey the triangle
    inequality.
    """
    for vector in (a, b):
        check_non_negative(vector)
    p = _prob_dist(a)
    q = _prob_dist(b)
    # Sparse matrices need .multiply() for the element-wise product.
    overlap = p.multiply(q) if sp.issparse(p) else p * q
    return -np.log(np.sum(np.sqrt(overlap)))


class Bhattacharyya(Distance):
    """Distance definition whose `fit` yields a `BhattacharyyaModel`."""

    supports_sparse = True
    supports_discrete = False

    def fit(self, data):
        """Return a `BhattacharyyaModel` built from `self.axis` and
        `self.impute`; `data` is not used when constructing the model.
        """
        model = BhattacharyyaModel(self.axis, self.impute)
        return model


class BhattacharyyaModel(DistanceModel):
    """Model that computes pairwise `_bhattacharyya` distances."""

    def compute_distances(self, x1, x2):
        """Return pairwise `_bhattacharyya` distances between `x1` and `x2`.

        When `x2` is None, `x1` is compared with itself. If `self.axis` is 1
        the inputs are passed on as given; otherwise both are transposed
        before the pairwise computation.
        """
        other = x1 if x2 is None else x2
        if self.axis != 1:
            x1, other = x1.T, other.T
        return pairwise_distances(x1, other, _bhattacharyya)


class Mahalanobis(Distance):
supports_sparse = False
supports_missing = False
Expand Down
76 changes: 58 additions & 18 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# pylint: disable=missing-docstring

from unittest import TestCase
import unittest
import pickle

import numpy as np
Expand All @@ -14,7 +15,8 @@
DiscreteVariable, StringVariable, Instance)
from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
PearsonR, PearsonRAbsolute, Manhattan, Cosine,
Jaccard, _preprocess, MahalanobisDistance)
Jaccard, _preprocess, MahalanobisDistance,
Bhattacharyya)
from Orange.distance.distance import _spearmanr2, _corrcoef2
from Orange.misc import DistMatrix
from Orange.tests import named_file, test_filename
Expand Down Expand Up @@ -91,20 +93,20 @@ def test_from_file(self):
self.assertEqual(m.axis, 1)

with named_file(
"""3 axis=1 symmetric
0.12 3.45 6.78
9.01 2.34 5.67
8.90""") as name:
"""3 axis=1 symmetric
0.12 3.45 6.78
9.01 2.34 5.67
8.90""") as name:
m = DistMatrix.from_file(name)
np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
[9.01, 2.34, 0],
[8.90, 0, 0]]))

with named_file(
"""3 row_labels
starič 0.12 3.45 6.78
aleš 9.01 2.34 5.67
anže 8.90""", encoding="utf-8""") as name:
"""3 row_labels
starič 0.12 3.45 6.78
aleš 9.01 2.34 5.67
anže 8.90""", encoding="utf-8""") as name:
m = DistMatrix.from_file(name)
np.testing.assert_almost_equal(m, np.array([[0.12, 9.01, 8.90],
[9.01, 2.34, 0],
Expand Down Expand Up @@ -150,10 +152,10 @@ def assertErrorMsg(content, msg):

def test_save(self):
with named_file(
"""3 axis=1 row_labels
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90""") as name:
"""3 axis=1 row_labels
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90""") as name:
m = DistMatrix.from_file(name)
m.save(name)
m = DistMatrix.from_file(name)
Expand All @@ -167,11 +169,11 @@ def test_save(self):
self.assertEqual(m.axis, 1)

with named_file(
"""3 axis=0 asymmetric col_labels row_labels
ann bert chad
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90 1.23 4.56""") as name:
"""3 axis=0 asymmetric col_labels row_labels
ann bert chad
danny 0.12 3.45 6.78
eve 9.01 2.34 5.67
frank 8.90 1.23 4.56""") as name:
m = DistMatrix.from_file(name)
m.save(name)
m = DistMatrix.from_file(name)
Expand Down Expand Up @@ -943,6 +945,41 @@ def test_dimensions(self):
mah(xt[0], xt[1])


class TestBhattacharyya(TestCase):
    """Tests for the Bhattacharyya distance on dense and sparse input."""

    @classmethod
    def setUpClass(cls):
        cls.dist = Bhattacharyya

    def test_dense_array(self):
        # Also checks normalization
        iris = Table('iris')
        expected = np.array([
            [0, 4.48049499e-04, 2.07117086e-05],
            [4.48049499e-04, 0, 3.65052724e-04],
            [2.07117086e-05, 3.65052724e-04, 0],
        ])
        computed = self.dist(iris.X[:3])
        np.testing.assert_array_almost_equal(computed, expected)

    def test_sparse_array(self):
        matrix = csr_matrix([[0.5, 0.5], [0, 0.5]])
        first_row, second_row = matrix[0], matrix[1]
        computed = self.dist(first_row, second_row)
        self.assertAlmostEqual(computed, 0.3465735902799726, delta=1e-5)

    def test_columns(self):
        columns = np.array([[0.5, 0.2], [0.5, 0.8]])
        off_diagonal = 0.05268025782891318
        expected = np.array([[0, off_diagonal],
                             [off_diagonal, 0]])
        computed = self.dist(columns, axis=0)
        np.testing.assert_array_almost_equal(computed, expected)

    def test_negative_input(self):
        # A NaN in the input must be rejected ...
        a = np.array([0, np.nan])
        b = np.array([1, 1])
        with self.assertRaises(ValueError):
            self.dist(a, b)

        # ... and so must a negative value, for dense input ...
        a[1] = -1
        with self.assertRaises(ValueError):
            self.dist(a, b)

        # ... as well as for sparse input.
        a = csr_matrix(a)
        b = csr_matrix(b)
        with self.assertRaises(ValueError):
            self.dist(a, b)


class TestDistances(TestCase):
@classmethod
def setUpClass(cls):
Expand Down Expand Up @@ -982,3 +1019,6 @@ def test_distance_to_instance(self):
iris = Table('iris')
inst = Instance(iris.domain, np.concatenate((iris[1].x, iris[1].y)))
self.assertEqual(Euclidean(iris[1], inst), 0)

# Allow this test module to be executed directly as a script.
if __name__ == "__main__":
    unittest.main()
3 changes: 2 additions & 1 deletion Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
("Absolute Spearman", distance.SpearmanRAbsolute),
("Pearson", distance.PearsonR),
("Absolute Pearson", distance.PearsonRAbsolute),
("Hamming", distance.Hamming)
("Hamming", distance.Hamming),
('Bhattacharyya', distance.Bhattacharyya)
]


Expand Down
9 changes: 9 additions & 0 deletions Orange/widgets/unsupervised/tests/test_owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ def test_too_big_array(self):
def test_migrates_normalized_dist(self):
w = self.create_widget(OWDistances, stored_settings={"metric_idx": 0})
self.assertFalse(w.normalized_dist)

def test_negative_values_bhattacharyya(self):
    """A negative value with the Bhattacharyya metric shows a value error."""
    # Flip the sign of one value so the data contains a negative entry.
    self.iris.X[0, 0] *= -1
    try:
        # Select the Bhattacharyya metric in the widget. The for/else makes
        # the test fail loudly instead of silently exercising the last
        # metric should Bhattacharyya ever be missing from METRICS.
        for self.widget.metric_idx, (_, metric) in enumerate(METRICS):
            if metric == distance.Bhattacharyya:
                break
        else:
            self.fail("Bhattacharyya is not listed in METRICS")
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertTrue(
            self.widget.Error.distances_value_error.is_shown())
    finally:
        # Undo the sign flip even when the checks above fail, in case
        # self.iris is reused by other tests.
        self.iris.X[0, 0] *= -1
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ Distances work well with Orange add-ons, too. The distance matrix can be fed to
- [Pearson](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the values, remapped as a distance in a [0, 1] interval)
- [Pearson absolute](https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient) (linear correlation between the absolute values, remapped as a distance in a [0, 1] interval)
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) (the number of features at which the corresponding values are different)
- [Bhattacharyya distance](https://en.wikipedia.org/wiki/Bhattacharyya_distance) (similarity between two probability distributions; not a true metric, as it does not obey the triangle inequality)

Normalize the features. Normalization is always done column-wise.
Normalize the features. Normalization is always done column-wise. Values are zero-centered and scaled.
In case of missing values, the widget automatically imputes the average value of the row or the column.
The widget works for both numeric and categorical data. In case of categorical data, the distance is 0 if the two values are the same ('green' and 'green') and 1 if they are not ('green' and 'blue').
3. Tick *Apply Automatically* to automatically commit changes to other widgets. Alternatively, press '*Apply*'.
Expand Down

0 comments on commit 0c0a1e9

Please sign in to comment.