diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index bd270de9806..40bb510eced 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -418,8 +418,6 @@ def __init__(self, absolute, axis=1, impute=False): self.absolute = absolute def compute_distances(self, x1, x2): - if x2 is None: - x2 = x1 rho = self.compute_correlation(x1, x2) if self.absolute: return (1. - np.abs(rho)) / 2. @@ -427,16 +425,121 @@ def compute_distances(self, x1, x2): return (1. - rho) / 2. def compute_correlation(self, x1, x2): - pass + raise NotImplementedError() class SpearmanModel(CorrelationDistanceModel): def compute_correlation(self, x1, x2): - rho = stats.spearmanr(x1, x2, axis=self.axis)[0] - if isinstance(rho, np.float): - return np.array([[rho]]) - slc = x1.shape[1 - self.axis] - return rho[:slc, slc:] + if x2 is None: + n1 = x1.shape[1 - self.axis] + if n1 == 2: + # Special case to properly fill degenerate self correlations + # (nan, inf on the diagonals) + rho = stats.spearmanr(x1, x1, axis=self.axis)[0] + assert rho.shape == (4, 4) + rho = rho[:2, :2].copy() + else: + # scalar if n1 == 1 + rho = stats.spearmanr(x1, axis=self.axis)[0] + return np.atleast_2d(rho) + else: + return _spearmanr2(x1, x2, axis=self.axis) + + +def _spearmanr2(a, b, axis=0): + """ + Compute all pairwise spearman rank moment correlations between rows + or columns of a and b + + Parameters + ---------- + a : (N, M) numpy.ndarray + The input cases a. + b : (J, K) numpy.ndarray + The input cases b. In case of axis == 0: J must equal N; + otherwise if axis == 1 then K must equal M. + axis : int + If 0 the correlation are computed between a and b's columns. + Otherwise if 1 the correlations are computed between rows. + + Returns + ------- + cor : (N, J) or (M, K) nd.array + If axis == 0 then (N, J) matrix of correlations between a x b columns + else a (N, J) matrix of correlations between a x b rows. + + See Also + -------- + scipy.stats.spearmanr + """ + a, b = np.atleast_2d(a, b) + assert a.shape[axis] == b.shape[axis] + ar = np.apply_along_axis(stats.rankdata, axis, a) + br = np.apply_along_axis(stats.rankdata, axis, b) + + return _corrcoef2(ar, br, axis=axis) + + +def _corrcoef2(a, b, axis=0): + """ + Compute all pairwise Pearson product-moment correlation coefficients + between rows or columns of a and b + + Parameters + ---------- + a : (N, M) numpy.ndarray + The input cases a. + b : (J, K) numpy.ndarray + The input cases b. In case of axis == 0: J must equal N; + otherwise if axis == 1 then K must equal M. + axis : int + If 0 the correlation are computed between a and b's columns. + Otherwise if 1 the correlations are computed between rows. + + Returns + ------- + cor : (N, J) or (M, K) nd.array + If axis == 0 then (N, J) matrix of correlations between a x b columns + else a (N, J) matrix of correlations between a x b rows. + + See Also + -------- + numpy.corrcoef + """ + a, b = np.atleast_2d(a, b) + if not (axis == 0 or axis == 1): + raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis)) + + mean_a = np.mean(a, axis=axis, keepdims=True) + mean_b = np.mean(b, axis=axis, keepdims=True) + assert a.shape[axis] == b.shape[axis] + + n = a.shape[1 - axis] + m = b.shape[1 - axis] + + a = a - mean_a + b = b - mean_b + + if axis == 0: + C = a.T.dot(b) + assert C.shape == (n, m) + elif axis == 1: + C = a.dot(b.T) + assert C.shape == (n, m) + + ss_a = np.sum(a ** 2, axis=axis, keepdims=True) + ss_b = np.sum(b ** 2, axis=axis, keepdims=True) + + if axis == 0: + ss_a = ss_a.T + else: + ss_b = ss_b.T + + assert ss_a.shape == (n, 1) + assert ss_b.shape == (1, m) + C /= np.sqrt(ss_a) + C /= np.sqrt(ss_b) + return C class CorrelationDistance(Distance): @@ -455,10 +558,11 @@ def fit(self, _): class PearsonModel(CorrelationDistanceModel): def compute_correlation(self, x1, x2): - if self.axis == 0: - x1 = x1.T - x2 = x2.T - return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1]) + if x2 is None: + c = np.corrcoef(x1, rowvar=self.axis == 1) + return np.atleast_2d(c) + else: + return _corrcoef2(x1, x2, axis=self.axis) class PearsonR(CorrelationDistance): diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py index 46693b2aafc..3864a188586 100644 --- a/Orange/tests/test_distances.py +++ b/Orange/tests/test_distances.py @@ -6,6 +6,8 @@ import numpy as np import scipy +import scipy.spatial +import scipy.stats from scipy.sparse import csr_matrix from Orange.data import (Table, Domain, ContinuousVariable, @@ -13,6 +15,7 @@ from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute, Manhattan, Cosine, Jaccard, _preprocess, MahalanobisDistance) +from Orange.distance.distance import _spearmanr2, _corrcoef2 from Orange.misc import DistMatrix from Orange.tests import named_file, test_filename from Orange.util import OrangeDeprecationWarning @@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self): [0.3833333], [0.]])) + def test_spearmanr2(self): + # Test that _spearnmanr2 returns the same result that stats.spearmanr + # would + n, m = tuple(np.random.randint(2, 5, size=2)) + mean = np.random.uniform(-1, 1, size=m) + cov = np.random.uniform(0, 1./m, size=(m, m)) + cov = (cov + cov.T) / 2 + cov.flat[::m + 1] = 1.0 + X1 = np.random.multivariate_normal(mean, cov, size=n) + X2 = np.random.multivariate_normal(mean, cov, size=n) + expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:] + np.testing.assert_almost_equal( + _spearmanr2(X1, X2, axis=1), + expected, + decimal=9 + ) + + expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:] + np.testing.assert_almost_equal( + _spearmanr2(X1, X2, axis=0), + expected, + decimal=9, + ) + # noinspection PyTypeChecker class TestSpearmanRAbsolute(TestCase): @@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self): [0.32783865], [0.]])) + def test_corrcoef2(self): + # Test that _corrcoef2 returns the same result that np.corrcoef would + n, m = tuple(np.random.randint(2, 5, size=2)) + mean = np.random.uniform(-1, 1, size=m) + cov = np.random.uniform(0, 1./m, size=(m, m)) + cov = (cov + cov.T) / 2 + cov.flat[::m + 1] = 1.0 + X1 = np.random.multivariate_normal(mean, cov, size=n) + X2 = np.random.multivariate_normal(mean, cov, size=n) + expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:] + np.testing.assert_almost_equal( + _corrcoef2(X1, X2, axis=1), + expected, + decimal=9 + ) + + expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:] + np.testing.assert_almost_equal( + _corrcoef2(X1, X2, axis=0), + expected, + decimal=9, + ) + + with self.assertRaises(ValueError): + _corrcoef2(X1, X2, axis=10) + # noinspection PyTypeChecker class TestPearsonRAbsolute(TestCase):