From b5264000f20db429a3bd371ec66178c9fb49e5ce Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Thu, 4 Jan 2018 14:27:47 +0100 Subject: [PATCH 1/3] distance: Speed and memory optimization * Use numpy.corrcoef in PearsonR * Optimize PearsonR/SpearmanR when computing pairwise distances on a single input table --- Orange/distance/distance.py | 46 ++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index bd270de9806..3c1b7bb12d8 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -418,8 +418,6 @@ def __init__(self, absolute, axis=1, impute=False): self.absolute = absolute def compute_distances(self, x1, x2): - if x2 is None: - x2 = x1 rho = self.compute_correlation(x1, x2) if self.absolute: return (1. - np.abs(rho)) / 2. @@ -432,11 +430,29 @@ def compute_correlation(self, x1, x2): class SpearmanModel(CorrelationDistanceModel): def compute_correlation(self, x1, x2): - rho = stats.spearmanr(x1, x2, axis=self.axis)[0] - if isinstance(rho, np.float): - return np.array([[rho]]) - slc = x1.shape[1 - self.axis] - return rho[:slc, slc:] + n1 = x1.shape[1 - self.axis] + n2 = x2.shape[1 - self.axis] if x2 is not None else 0 + if x2 is None: + if n1 == 2: + # Special case to properly fill degenerate self correlations + # (nan, inf on the diagonals) + rho = stats.spearmanr(x1, x1, axis=self.axis)[0] + assert rho.shape == (4, 4) + rho = rho[:2, :2].copy() + else: + # scalar if n1 == 1 + rho = stats.spearmanr(x1, axis=self.axis)[0] + return np.atleast_2d(rho) + else: + # this computes too much (most of it is thrown away) + rho = stats.spearmanr(x1, x2, axis=self.axis)[0] + if np.isscalar(rho): + # scalar if n1 + n2 <= 2 + assert n1 + n2 <= 2 + return np.atleast_2d(rho) + else: + assert rho.shape == (n1 + n2, n1 + n2) + return rho[:n1, n1:].copy() class CorrelationDistance(Distance): @@ -455,10 +471,18 @@ def fit(self, _): class PearsonModel(CorrelationDistanceModel): def compute_correlation(self, x1, x2): - if self.axis == 0: - x1 = x1.T - x2 = x2.T - return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1]) + if x2 is None: + c = np.corrcoef(x1, rowvar=self.axis == 1) + return np.atleast_2d(c) + else: + # this computes too much (most of it is thrown away)` + c = np.corrcoef(x1, x2, rowvar=self.axis == 1) + if np.isscalar(c): + return np.atleast_2d(c) + n1 = x1.shape[1 - self.axis] + n2 = x2.shape[1 - self.axis] + assert c.shape[0] == c.shape[1] == n1 + n2 + return c[:n1, n1:].copy() class PearsonR(CorrelationDistance): From 3d10937eeb0ec39ef004aca2793d27973fae45d2 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Thu, 4 Jan 2018 16:35:22 +0100 Subject: [PATCH 2/3] distances: Optimize PearsonR/SpearmanR ... ... for the case where computing distances from two tables. --- Orange/distance/distance.py | 119 ++++++++++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 20 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index 3c1b7bb12d8..c49045dcf1c 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -425,14 +425,13 @@ def compute_distances(self, x1, x2): return (1. - rho) / 2. def compute_correlation(self, x1, x2): - pass + raise NotImplementedError() class SpearmanModel(CorrelationDistanceModel): def compute_correlation(self, x1, x2): - n1 = x1.shape[1 - self.axis] - n2 = x2.shape[1 - self.axis] if x2 is not None else 0 if x2 is None: + n1 = x1.shape[1 - self.axis] if n1 == 2: # Special case to properly fill degenerate self correlations # (nan, inf on the diagonals) @@ -444,15 +443,102 @@ def compute_correlation(self, x1, x2): rho = stats.spearmanr(x1, axis=self.axis)[0] return np.atleast_2d(rho) else: - # this computes too much (most of it is thrown away) - rho = stats.spearmanr(x1, x2, axis=self.axis)[0] - if np.isscalar(rho): - # scalar if n1 + n2 <= 2 - assert n1 + n2 <= 2 - return np.atleast_2d(rho) - else: - assert rho.shape == (n1 + n2, n1 + n2) - return rho[:n1, n1:].copy() + return _spearmanr2(x1, x2, axis=self.axis) + + +def _spearmanr2(a, b, axis=0): + """ + Compute all pairwise spearman rank moment correlations between rows + or columns of a and b + + Parameters + ---------- + a : (N, M) numpy.ndarray + The input cases a. + b : (J, K) numpy.ndarray + The input cases b. In case of axis == 0: J must equal N; + otherwise if axis == 1 then K must equal M. + axis : int + If 0 the correlation are computed between a and b's columns. + Otherwise if 1 the correlations are computed between rows. + + Returns + ------- + cor : (N, J) or (M, K) nd.array + If axis == 0 then (N, J) matrix of correlations between a x b columns + else a (N, J) matrix of correlations between a x b rows. + + See Also + -------- + scipy.stats.spearmanr + """ + a, b = np.atleast_2d(a, b) + assert a.shape[axis] == b.shape[axis] + ar = np.apply_along_axis(stats.rankdata, axis, a) + br = np.apply_along_axis(stats.rankdata, axis, b) + + return _corrcoef2(ar, br, axis=axis) + + +def _corrcoef2(a, b, axis=0): + """ + Compute all pairwise Pearson product-moment correlation coefficients + between rows or columns of a and b + + Parameters + ---------- + a : (N, M) numpy.ndarray + The input cases a. + b : (J, K) numpy.ndarray + The input cases b. In case of axis == 0: J must equal N; + otherwise if axis == 1 then K must equal M. + axis : int + If 0 the correlation are computed between a and b's columns. + Otherwise if 1 the correlations are computed between rows. + + Returns + ------- + cor : (N, J) or (M, K) nd.array + If axis == 0 then (N, J) matrix of correlations between a x b columns + else a (N, J) matrix of correlations between a x b rows. + + See Also + -------- + numpy.corrcoef + """ + a, b = np.atleast_2d(a, b) + mean_a = np.mean(a, axis=axis, keepdims=True) + mean_b = np.mean(b, axis=axis, keepdims=True) + assert a.shape[axis] == b.shape[axis] + + n = a.shape[1 - axis] + m = b.shape[1 - axis] + + a = a - mean_a + b = b - mean_b + + if axis == 0: + C = a.T.dot(b) + assert C.shape == (n, m) + elif axis == 1: + C = a.dot(b.T) + assert C.shape == (n, m) + else: + raise ValueError() + + ss_a = np.sum(a ** 2, axis=axis, keepdims=True) + ss_b = np.sum(b ** 2, axis=axis, keepdims=True) + + if axis == 0: + ss_a = ss_a.T + else: + ss_b = ss_b.T + + assert ss_a.shape == (n, 1) + assert ss_b.shape == (1, m) + C /= np.sqrt(ss_a) + C /= np.sqrt(ss_b) + return C class CorrelationDistance(Distance): @@ -475,14 +561,7 @@ def compute_correlation(self, x1, x2): c = np.corrcoef(x1, rowvar=self.axis == 1) return np.atleast_2d(c) else: - # this computes too much (most of it is thrown away)` - c = np.corrcoef(x1, x2, rowvar=self.axis == 1) - if np.isscalar(c): - return np.atleast_2d(c) - n1 = x1.shape[1 - self.axis] - n2 = x2.shape[1 - self.axis] - assert c.shape[0] == c.shape[1] == n1 + n2 - return c[:n1, n1:].copy() + return _corrcoef2(x1, x2, axis=self.axis) class PearsonR(CorrelationDistance): From f83792039fd792b71d8b9f69e16481e96174a0c3 Mon Sep 17 00:00:00 2001 From: Ales Erjavec Date: Tue, 16 Jan 2018 13:54:33 +0100 Subject: [PATCH 3/3] tests: Add tests for implementations of _corrcoef2 and _spearmanr2 --- Orange/distance/distance.py | 5 ++-- Orange/tests/test_distances.py | 53 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py index c49045dcf1c..40bb510eced 100644 --- a/Orange/distance/distance.py +++ b/Orange/distance/distance.py @@ -507,6 +507,9 @@ def _corrcoef2(a, b, axis=0): numpy.corrcoef """ a, b = np.atleast_2d(a, b) + if not (axis == 0 or axis == 1): + raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis)) + mean_a = np.mean(a, axis=axis, keepdims=True) mean_b = np.mean(b, axis=axis, keepdims=True) assert a.shape[axis] == b.shape[axis] @@ -523,8 +526,6 @@ def _corrcoef2(a, b, axis=0): elif axis == 1: C = a.dot(b.T) assert C.shape == (n, m) - else: - raise ValueError() ss_a = np.sum(a ** 2, axis=axis, keepdims=True) ss_b = np.sum(b ** 2, axis=axis, keepdims=True) diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py index 46693b2aafc..3864a188586 100644 --- a/Orange/tests/test_distances.py +++ b/Orange/tests/test_distances.py @@ -6,6 +6,8 @@ import numpy as np import scipy +import scipy.spatial +import scipy.stats from scipy.sparse import csr_matrix from Orange.data import (Table, Domain, ContinuousVariable, @@ -13,6 +15,7 @@ from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute, PearsonR, PearsonRAbsolute, Manhattan, Cosine, Jaccard, _preprocess, MahalanobisDistance) +from Orange.distance.distance import _spearmanr2, _corrcoef2 from Orange.misc import DistMatrix from Orange.tests import named_file, test_filename from Orange.util import OrangeDeprecationWarning @@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self): [0.3833333], [0.]])) + def test_spearmanr2(self): + # Test that _spearnmanr2 returns the same result that stats.spearmanr + # would + n, m = tuple(np.random.randint(2, 5, size=2)) + mean = np.random.uniform(-1, 1, size=m) + cov = np.random.uniform(0, 1./m, size=(m, m)) + cov = (cov + cov.T) / 2 + cov.flat[::m + 1] = 1.0 + X1 = np.random.multivariate_normal(mean, cov, size=n) + X2 = np.random.multivariate_normal(mean, cov, size=n) + expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:] + np.testing.assert_almost_equal( + _spearmanr2(X1, X2, axis=1), + expected, + decimal=9 + ) + + expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:] + np.testing.assert_almost_equal( + _spearmanr2(X1, X2, axis=0), + expected, + decimal=9, + ) + # noinspection PyTypeChecker class TestSpearmanRAbsolute(TestCase): @@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self): [0.32783865], [0.]])) + def test_corrcoef2(self): + # Test that _corrcoef2 returns the same result that np.corrcoef would + n, m = tuple(np.random.randint(2, 5, size=2)) + mean = np.random.uniform(-1, 1, size=m) + cov = np.random.uniform(0, 1./m, size=(m, m)) + cov = (cov + cov.T) / 2 + cov.flat[::m + 1] = 1.0 + X1 = np.random.multivariate_normal(mean, cov, size=n) + X2 = np.random.multivariate_normal(mean, cov, size=n) + expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:] + np.testing.assert_almost_equal( + _corrcoef2(X1, X2, axis=1), + expected, + decimal=9 + ) + + expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:] + np.testing.assert_almost_equal( + _corrcoef2(X1, X2, axis=0), + expected, + decimal=9, + ) + + with self.assertRaises(ValueError): + _corrcoef2(X1, X2, axis=10) + # noinspection PyTypeChecker class TestPearsonRAbsolute(TestCase):