Skip to content

Commit

Permalink
Merge pull request #2852 from ales-erjavec/distances-correlations-opt…
Browse files Browse the repository at this point in the history
…imization

[ENH] Distances: Optimize PearsonR/SpearmanR
  • Loading branch information
thocevar authored Feb 26, 2018
2 parents 3fdba8d + f837920 commit b09b282
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 12 deletions.
128 changes: 116 additions & 12 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,25 +418,128 @@ def __init__(self, absolute, axis=1, impute=False):
self.absolute = absolute

def compute_distances(self, x1, x2):
if x2 is None:
x2 = x1
rho = self.compute_correlation(x1, x2)
if self.absolute:
return (1. - np.abs(rho)) / 2.
else:
return (1. - rho) / 2.

def compute_correlation(self, x1, x2):
pass
raise NotImplementedError()


class SpearmanModel(CorrelationDistanceModel):
def compute_correlation(self, x1, x2):
rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
if isinstance(rho, np.float):
return np.array([[rho]])
slc = x1.shape[1 - self.axis]
return rho[:slc, slc:]
if x2 is None:
n1 = x1.shape[1 - self.axis]
if n1 == 2:
# Special case to properly fill degenerate self correlations
# (nan, inf on the diagonals)
rho = stats.spearmanr(x1, x1, axis=self.axis)[0]
assert rho.shape == (4, 4)
rho = rho[:2, :2].copy()
else:
# scalar if n1 == 1
rho = stats.spearmanr(x1, axis=self.axis)[0]
return np.atleast_2d(rho)
else:
return _spearmanr2(x1, x2, axis=self.axis)


def _spearmanr2(a, b, axis=0):
"""
Compute all pairwise spearman rank moment correlations between rows
or columns of a and b
Parameters
----------
a : (N, M) numpy.ndarray
The input cases a.
b : (J, K) numpy.ndarray
The input cases b. In case of axis == 0: J must equal N;
otherwise if axis == 1 then K must equal M.
axis : int
If 0 the correlation are computed between a and b's columns.
Otherwise if 1 the correlations are computed between rows.
Returns
-------
cor : (N, J) or (M, K) nd.array
If axis == 0 then (N, J) matrix of correlations between a x b columns
else a (N, J) matrix of correlations between a x b rows.
See Also
--------
scipy.stats.spearmanr
"""
a, b = np.atleast_2d(a, b)
assert a.shape[axis] == b.shape[axis]
ar = np.apply_along_axis(stats.rankdata, axis, a)
br = np.apply_along_axis(stats.rankdata, axis, b)

return _corrcoef2(ar, br, axis=axis)


def _corrcoef2(a, b, axis=0):
"""
Compute all pairwise Pearson product-moment correlation coefficients
between rows or columns of a and b
Parameters
----------
a : (N, M) numpy.ndarray
The input cases a.
b : (J, K) numpy.ndarray
The input cases b. In case of axis == 0: J must equal N;
otherwise if axis == 1 then K must equal M.
axis : int
If 0 the correlation are computed between a and b's columns.
Otherwise if 1 the correlations are computed between rows.
Returns
-------
cor : (N, J) or (M, K) nd.array
If axis == 0 then (N, J) matrix of correlations between a x b columns
else a (N, J) matrix of correlations between a x b rows.
See Also
--------
numpy.corrcoef
"""
a, b = np.atleast_2d(a, b)
if not (axis == 0 or axis == 1):
raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))

mean_a = np.mean(a, axis=axis, keepdims=True)
mean_b = np.mean(b, axis=axis, keepdims=True)
assert a.shape[axis] == b.shape[axis]

n = a.shape[1 - axis]
m = b.shape[1 - axis]

a = a - mean_a
b = b - mean_b

if axis == 0:
C = a.T.dot(b)
assert C.shape == (n, m)
elif axis == 1:
C = a.dot(b.T)
assert C.shape == (n, m)

ss_a = np.sum(a ** 2, axis=axis, keepdims=True)
ss_b = np.sum(b ** 2, axis=axis, keepdims=True)

if axis == 0:
ss_a = ss_a.T
else:
ss_b = ss_b.T

assert ss_a.shape == (n, 1)
assert ss_b.shape == (1, m)
C /= np.sqrt(ss_a)
C /= np.sqrt(ss_b)
return C


class CorrelationDistance(Distance):
Expand All @@ -455,10 +558,11 @@ def fit(self, _):

class PearsonModel(CorrelationDistanceModel):
def compute_correlation(self, x1, x2):
if self.axis == 0:
x1 = x1.T
x2 = x2.T
return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
if x2 is None:
c = np.corrcoef(x1, rowvar=self.axis == 1)
return np.atleast_2d(c)
else:
return _corrcoef2(x1, x2, axis=self.axis)


class PearsonR(CorrelationDistance):
Expand Down
53 changes: 53 additions & 0 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@

import numpy as np
import scipy
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
PearsonR, PearsonRAbsolute, Manhattan, Cosine,
Jaccard, _preprocess, MahalanobisDistance)
from Orange.distance.distance import _spearmanr2, _corrcoef2
from Orange.misc import DistMatrix
from Orange.tests import named_file, test_filename
from Orange.util import OrangeDeprecationWarning
Expand Down Expand Up @@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self):
[0.3833333],
[0.]]))

def test_spearmanr2(self):
# Test that _spearnmanr2 returns the same result that stats.spearmanr
# would
n, m = tuple(np.random.randint(2, 5, size=2))
mean = np.random.uniform(-1, 1, size=m)
cov = np.random.uniform(0, 1./m, size=(m, m))
cov = (cov + cov.T) / 2
cov.flat[::m + 1] = 1.0
X1 = np.random.multivariate_normal(mean, cov, size=n)
X2 = np.random.multivariate_normal(mean, cov, size=n)
expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:]
np.testing.assert_almost_equal(
_spearmanr2(X1, X2, axis=1),
expected,
decimal=9
)

expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:]
np.testing.assert_almost_equal(
_spearmanr2(X1, X2, axis=0),
expected,
decimal=9,
)


# noinspection PyTypeChecker
class TestSpearmanRAbsolute(TestCase):
Expand Down Expand Up @@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self):
[0.32783865],
[0.]]))

def test_corrcoef2(self):
# Test that _corrcoef2 returns the same result that np.corrcoef would
n, m = tuple(np.random.randint(2, 5, size=2))
mean = np.random.uniform(-1, 1, size=m)
cov = np.random.uniform(0, 1./m, size=(m, m))
cov = (cov + cov.T) / 2
cov.flat[::m + 1] = 1.0
X1 = np.random.multivariate_normal(mean, cov, size=n)
X2 = np.random.multivariate_normal(mean, cov, size=n)
expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:]
np.testing.assert_almost_equal(
_corrcoef2(X1, X2, axis=1),
expected,
decimal=9
)

expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:]
np.testing.assert_almost_equal(
_corrcoef2(X1, X2, axis=0),
expected,
decimal=9,
)

with self.assertRaises(ValueError):
_corrcoef2(X1, X2, axis=10)


# noinspection PyTypeChecker
class TestPearsonRAbsolute(TestCase):
Expand Down

0 comments on commit b09b282

Please sign in to comment.