From b5264000f20db429a3bd371ec66178c9fb49e5ce Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Thu, 4 Jan 2018 14:27:47 +0100
Subject: [PATCH 1/3] distance: Speed and memory optimization

* Use numpy.corrcoef in PearsonR
* Optimize PearsonR/SpearmanR when computing pairwise distances on
  a single input table
---
 Orange/distance/distance.py | 46 ++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index bd270de9806..3c1b7bb12d8 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -418,8 +418,6 @@ def __init__(self, absolute, axis=1, impute=False):
         self.absolute = absolute
 
     def compute_distances(self, x1, x2):
-        if x2 is None:
-            x2 = x1
         rho = self.compute_correlation(x1, x2)
         if self.absolute:
             return (1. - np.abs(rho)) / 2.
@@ -432,11 +430,29 @@ def compute_correlation(self, x1, x2):
 
 class SpearmanModel(CorrelationDistanceModel):
     def compute_correlation(self, x1, x2):
-        rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
-        if isinstance(rho, np.float):
-            return np.array([[rho]])
-        slc = x1.shape[1 - self.axis]
-        return rho[:slc, slc:]
+        n1 = x1.shape[1 - self.axis]
+        n2 = x2.shape[1 - self.axis] if x2 is not None else 0
+        if x2 is None:
+            if n1 == 2:
+                # Special case to properly fill degenerate self correlations
+                # (nan, inf on the diagonals)
+                rho = stats.spearmanr(x1, x1, axis=self.axis)[0]
+                assert rho.shape == (4, 4)
+                rho = rho[:2, :2].copy()
+            else:
+                # scalar if n1 == 1
+                rho = stats.spearmanr(x1, axis=self.axis)[0]
+            return np.atleast_2d(rho)
+        else:
+            # this computes too much (most of it is thrown away)
+            rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
+            if np.isscalar(rho):
+                # scalar if n1 + n2 <= 2
+                assert n1 + n2 <= 2
+                return np.atleast_2d(rho)
+            else:
+                assert rho.shape == (n1 + n2, n1 + n2)
+                return rho[:n1, n1:].copy()
 
 
 class CorrelationDistance(Distance):
@@ -455,10 +471,18 @@ def fit(self, _):
 
 class PearsonModel(CorrelationDistanceModel):
     def compute_correlation(self, x1, x2):
-        if self.axis == 0:
-            x1 = x1.T
-            x2 = x2.T
-        return np.array([[stats.pearsonr(i, j)[0] for j in x2] for i in x1])
+        if x2 is None:
+            c = np.corrcoef(x1, rowvar=self.axis == 1)
+            return np.atleast_2d(c)
+        else:
+            # this computes too much (most of it is thrown away)`
+            c = np.corrcoef(x1, x2, rowvar=self.axis == 1)
+            if np.isscalar(c):
+                return np.atleast_2d(c)
+            n1 = x1.shape[1 - self.axis]
+            n2 = x2.shape[1 - self.axis]
+            assert c.shape[0] == c.shape[1] == n1 + n2
+            return c[:n1, n1:].copy()
 
 
 class PearsonR(CorrelationDistance):

From 3d10937eeb0ec39ef004aca2793d27973fae45d2 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Thu, 4 Jan 2018 16:35:22 +0100
Subject: [PATCH 2/3] distances: Optimize PearsonR/SpearmanR ...

... for the case where computing distances from two tables.
---
 Orange/distance/distance.py | 119 ++++++++++++++++++++++++++++++------
 1 file changed, 99 insertions(+), 20 deletions(-)

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index 3c1b7bb12d8..c49045dcf1c 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -425,14 +425,13 @@ def compute_distances(self, x1, x2):
             return (1. - rho) / 2.
 
     def compute_correlation(self, x1, x2):
-        pass
+        raise NotImplementedError()
 
 
 class SpearmanModel(CorrelationDistanceModel):
     def compute_correlation(self, x1, x2):
-        n1 = x1.shape[1 - self.axis]
-        n2 = x2.shape[1 - self.axis] if x2 is not None else 0
         if x2 is None:
+            n1 = x1.shape[1 - self.axis]
             if n1 == 2:
                 # Special case to properly fill degenerate self correlations
                 # (nan, inf on the diagonals)
@@ -444,15 +443,102 @@ def compute_correlation(self, x1, x2):
                 rho = stats.spearmanr(x1, axis=self.axis)[0]
             return np.atleast_2d(rho)
         else:
-            # this computes too much (most of it is thrown away)
-            rho = stats.spearmanr(x1, x2, axis=self.axis)[0]
-            if np.isscalar(rho):
-                # scalar if n1 + n2 <= 2
-                assert n1 + n2 <= 2
-                return np.atleast_2d(rho)
-            else:
-                assert rho.shape == (n1 + n2, n1 + n2)
-                return rho[:n1, n1:].copy()
+            return _spearmanr2(x1, x2, axis=self.axis)
+
+
+def _spearmanr2(a, b, axis=0):
+    """
+    Compute all pairwise spearman rank moment correlations between rows
+    or columns of a and b
+
+    Parameters
+    ----------
+    a : (N, M) numpy.ndarray
+        The input cases a.
+    b : (J, K) numpy.ndarray
+        The input cases b. In case of axis == 0: J must equal N;
+        otherwise if axis == 1 then K must equal M.
+    axis : int
+        If 0 the correlation are computed between a and b's columns.
+        Otherwise if 1 the correlations are computed between rows.
+
+    Returns
+    -------
+    cor : (N, J) or (M, K) nd.array
+        If axis == 0 then (N, J) matrix of correlations between a x b columns
+        else a (N, J) matrix of correlations between a x b rows.
+
+    See Also
+    --------
+    scipy.stats.spearmanr
+    """
+    a, b = np.atleast_2d(a, b)
+    assert a.shape[axis] == b.shape[axis]
+    ar = np.apply_along_axis(stats.rankdata, axis, a)
+    br = np.apply_along_axis(stats.rankdata, axis, b)
+
+    return _corrcoef2(ar, br, axis=axis)
+
+
+def _corrcoef2(a, b, axis=0):
+    """
+    Compute all pairwise Pearson product-moment correlation coefficients
+    between rows or columns of a and b
+
+    Parameters
+    ----------
+    a : (N, M) numpy.ndarray
+        The input cases a.
+    b : (J, K) numpy.ndarray
+        The input cases b. In case of axis == 0: J must equal N;
+        otherwise if axis == 1 then K must equal M.
+    axis : int
+        If 0 the correlation are computed between a and b's columns.
+        Otherwise if 1 the correlations are computed between rows.
+
+    Returns
+    -------
+    cor : (N, J) or (M, K) nd.array
+        If axis == 0 then (N, J) matrix of correlations between a x b columns
+        else a (N, J) matrix of correlations between a x b rows.
+
+    See Also
+    --------
+    numpy.corrcoef
+    """
+    a, b = np.atleast_2d(a, b)
+    mean_a = np.mean(a, axis=axis, keepdims=True)
+    mean_b = np.mean(b, axis=axis, keepdims=True)
+    assert a.shape[axis] == b.shape[axis]
+
+    n = a.shape[1 - axis]
+    m = b.shape[1 - axis]
+
+    a = a - mean_a
+    b = b - mean_b
+
+    if axis == 0:
+        C = a.T.dot(b)
+        assert C.shape == (n, m)
+    elif axis == 1:
+        C = a.dot(b.T)
+        assert C.shape == (n, m)
+    else:
+        raise ValueError()
+
+    ss_a = np.sum(a ** 2, axis=axis, keepdims=True)
+    ss_b = np.sum(b ** 2, axis=axis, keepdims=True)
+
+    if axis == 0:
+        ss_a = ss_a.T
+    else:
+        ss_b = ss_b.T
+
+    assert ss_a.shape == (n, 1)
+    assert ss_b.shape == (1, m)
+    C /= np.sqrt(ss_a)
+    C /= np.sqrt(ss_b)
+    return C
 
 
 class CorrelationDistance(Distance):
@@ -475,14 +561,7 @@ def compute_correlation(self, x1, x2):
             c = np.corrcoef(x1, rowvar=self.axis == 1)
             return np.atleast_2d(c)
         else:
-            # this computes too much (most of it is thrown away)`
-            c = np.corrcoef(x1, x2, rowvar=self.axis == 1)
-            if np.isscalar(c):
-                return np.atleast_2d(c)
-            n1 = x1.shape[1 - self.axis]
-            n2 = x2.shape[1 - self.axis]
-            assert c.shape[0] == c.shape[1] == n1 + n2
-            return c[:n1, n1:].copy()
+            return _corrcoef2(x1, x2, axis=self.axis)
 
 
 class PearsonR(CorrelationDistance):

From f83792039fd792b71d8b9f69e16481e96174a0c3 Mon Sep 17 00:00:00 2001
From: Ales Erjavec <ales.erjavec@fri.uni-lj.si>
Date: Tue, 16 Jan 2018 13:54:33 +0100
Subject: [PATCH 3/3] tests: Add tests for implementations of _corrcoef2 and
 _spearmanr2

---
 Orange/distance/distance.py    |  5 ++--
 Orange/tests/test_distances.py | 53 ++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
index c49045dcf1c..40bb510eced 100644
--- a/Orange/distance/distance.py
+++ b/Orange/distance/distance.py
@@ -507,6 +507,9 @@ def _corrcoef2(a, b, axis=0):
     numpy.corrcoef
     """
     a, b = np.atleast_2d(a, b)
+    if not (axis == 0 or axis == 1):
+        raise ValueError("Invalid axis {} (only 0 or 1 accepted)".format(axis))
+
     mean_a = np.mean(a, axis=axis, keepdims=True)
     mean_b = np.mean(b, axis=axis, keepdims=True)
     assert a.shape[axis] == b.shape[axis]
@@ -523,8 +526,6 @@ def _corrcoef2(a, b, axis=0):
     elif axis == 1:
         C = a.dot(b.T)
         assert C.shape == (n, m)
-    else:
-        raise ValueError()
 
     ss_a = np.sum(a ** 2, axis=axis, keepdims=True)
     ss_b = np.sum(b ** 2, axis=axis, keepdims=True)
diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
index 46693b2aafc..3864a188586 100644
--- a/Orange/tests/test_distances.py
+++ b/Orange/tests/test_distances.py
@@ -6,6 +6,8 @@
 
 import numpy as np
 import scipy
+import scipy.spatial
+import scipy.stats
 from scipy.sparse import csr_matrix
 
 from Orange.data import (Table, Domain, ContinuousVariable,
@@ -13,6 +15,7 @@
 from Orange.distance import (Euclidean, SpearmanR, SpearmanRAbsolute,
                              PearsonR, PearsonRAbsolute, Manhattan, Cosine,
                              Jaccard, _preprocess, MahalanobisDistance)
+from Orange.distance.distance import _spearmanr2, _corrcoef2
 from Orange.misc import DistMatrix
 from Orange.tests import named_file, test_filename
 from Orange.util import OrangeDeprecationWarning
@@ -598,6 +601,30 @@ def test_spearmanr_distance_numpy(self):
                       [0.3833333],
                       [0.]]))
 
+    def test_spearmanr2(self):
+        # Test that _spearnmanr2 returns the same result that stats.spearmanr
+        # would
+        n, m = tuple(np.random.randint(2, 5, size=2))
+        mean = np.random.uniform(-1, 1, size=m)
+        cov = np.random.uniform(0, 1./m, size=(m, m))
+        cov = (cov + cov.T) / 2
+        cov.flat[::m + 1] = 1.0
+        X1 = np.random.multivariate_normal(mean, cov, size=n)
+        X2 = np.random.multivariate_normal(mean, cov, size=n)
+        expected = scipy.stats.spearmanr(X1, X2, axis=1)[0][:n, n:]
+        np.testing.assert_almost_equal(
+            _spearmanr2(X1, X2, axis=1),
+            expected,
+            decimal=9
+        )
+
+        expected = scipy.stats.spearmanr(X1, X2, axis=0)[0][:m, m:]
+        np.testing.assert_almost_equal(
+            _spearmanr2(X1, X2, axis=0),
+            expected,
+            decimal=9,
+        )
+
 
 # noinspection PyTypeChecker
 class TestSpearmanRAbsolute(TestCase):
@@ -752,6 +779,32 @@ def test_pearsonr_distance_numpy(self):
                       [0.32783865],
                       [0.]]))
 
+    def test_corrcoef2(self):
+        # Test that _corrcoef2 returns the same result that np.corrcoef would
+        n, m = tuple(np.random.randint(2, 5, size=2))
+        mean = np.random.uniform(-1, 1, size=m)
+        cov = np.random.uniform(0, 1./m, size=(m, m))
+        cov = (cov + cov.T) / 2
+        cov.flat[::m + 1] = 1.0
+        X1 = np.random.multivariate_normal(mean, cov, size=n)
+        X2 = np.random.multivariate_normal(mean, cov, size=n)
+        expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:]
+        np.testing.assert_almost_equal(
+            _corrcoef2(X1, X2, axis=1),
+            expected,
+            decimal=9
+        )
+
+        expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:]
+        np.testing.assert_almost_equal(
+            _corrcoef2(X1, X2, axis=0),
+            expected,
+            decimal=9,
+        )
+
+        with self.assertRaises(ValueError):
+            _corrcoef2(X1, X2, axis=10)
+
 
 # noinspection PyTypeChecker
 class TestPearsonRAbsolute(TestCase):