PCA: Preserve f32s & reduce memory footprint when computing means
pavlin-policar committed Feb 10, 2019
1 parent 920e474 commit e714b48
Showing 3 changed files with 47 additions and 17 deletions.
13 changes: 11 additions & 2 deletions Orange/projection/pca.py
@@ -1,12 +1,13 @@
import numbers

import six
import numpy as np
import scipy.sparse as sp
from scipy.linalg import lu, qr, svd

from sklearn import decomposition as skl_decomposition
from sklearn.utils import check_array, check_random_state
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils.validation import check_is_fitted

try:
@@ -45,10 +46,14 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",

    n_samples, n_features = A.shape

-    c = np.atleast_2d(A.mean(axis=0))
+    c = np.atleast_2d(ut.nanmean(A, axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            # Ensure f32 is preserved as f32
+            Q = Q.astype(A.dtype, copy=False)
+
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
@@ -66,6 +71,10 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            # Ensure f32 is preserved as f32
+            Q = Q.astype(A.dtype, copy=False)
+
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
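Why the astype calls in pca.py matter: NumPy's random normal draws always come back as float64, so multiplying a float32 data matrix by the unconverted projection matrix silently upcasts every intermediate to float64 and doubles its memory footprint. A small illustrative sketch of the effect (not part of the commit):

import numpy as np

rng = np.random.RandomState(0)
A = rng.uniform(size=(1000, 50)).astype(np.float32)  # float32 input data

Q = rng.normal(size=(50, 20))        # normal() always returns float64
print((A @ Q).dtype)                 # float64 -- the product is silently upcast

if A.dtype.kind == "f":
    Q = Q.astype(A.dtype, copy=False)
print((A @ Q).dtype)                 # float32 -- intermediates stay at half the size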
18 changes: 10 additions & 8 deletions Orange/statistics/util.py
@@ -10,6 +10,7 @@
import bottleneck as bn
from scipy import sparse as sp
import scipy.stats.stats
+from sklearn.utils.sparsefuncs import mean_variance_axis


def _count_nans_per_row_sparse(X, weights, dtype=None):
@@ -422,14 +423,15 @@ def nansum_sparse(x):

def nanmean(x, axis=None):
    """ Equivalent of np.nanmean that supports sparse or dense matrices. """
-    def nanmean_sparse(x):
-        n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
-        if not n_values:
-            warnings.warn(RuntimeWarning, "Mean of empty slice")
-            return np.nan
-        return np.nansum(x.data) / n_values
-
-    return _apply_func(x, np.nanmean, nanmean_sparse, axis=axis)
+    if not sp.issparse(x):
+        means = np.nanmean(x, axis=axis)
+    elif axis is None:
+        means, _ = mean_variance_axis(x, axis=0)
+        means = np.nanmean(means)
+    else:
+        means, _ = mean_variance_axis(x, axis=axis)
+
+    return means


def nanvar(x, axis=None, ddof=0):
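A note on the new sparse path in nanmean: rather than summing x.data by hand, sparse inputs are handed to scikit-learn's mean_variance_axis, which computes per-axis means directly on the CSR/CSC structure without densifying the matrix. A brief usage sketch of that helper (illustrative, not taken from the commit):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X = sp.csr_matrix(np.arange(12, dtype=float).reshape(4, 3))

means, variances = mean_variance_axis(X, axis=0)  # column means, no densifying
print(means)                                      # [4.5 5.5 6.5]
print(X.toarray().mean(axis=0))                   # matches the dense result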
33 changes: 26 additions & 7 deletions Orange/tests/test_statistics.py
@@ -10,6 +10,7 @@
from Orange.statistics.util import bincount, countnans, contingency, digitize, \
    mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
    unique, var, nanstd, nanvar, nanmode
+from sklearn.utils import check_random_state


def dense_sparse(test_case):
@@ -166,13 +167,6 @@ def test_mean(self):
        with self.assertWarns(UserWarning):
            mean([1, np.nan, 0])

-    def test_nanmean(self):
-        for X in self.data:
-            X_sparse = csr_matrix(X)
-            np.testing.assert_array_equal(
-                nanmean(X_sparse),
-                np.nanmean(X))
-
    def test_nanmode(self):
        X = np.array([[np.nan, np.nan, 1, 1],
                      [2, np.nan, 1, 1]])
@@ -270,6 +264,31 @@ def test_nanstd_with_ddof(self):
        )


+class TestNanmean(unittest.TestCase):
+    def setUp(self):
+        self.random_state = check_random_state(42)
+        self.x = self.random_state.uniform(size=(10, 5))
+        np.fill_diagonal(self.x, np.nan)
+
+    @dense_sparse
+    def test_axis_none(self, array):
+        np.testing.assert_almost_equal(
+            np.nanmean(self.x), nanmean(array(self.x))
+        )
+
+    @dense_sparse
+    def test_axis_0(self, array):
+        np.testing.assert_almost_equal(
+            np.nanmean(self.x, axis=0), nanmean(array(self.x), axis=0)
+        )
+
+    @dense_sparse
+    def test_axis_1(self, array):
+        np.testing.assert_almost_equal(
+            np.nanmean(self.x, axis=1), nanmean(array(self.x), axis=1)
+        )
+

class TestDigitize(unittest.TestCase):
    def setUp(self):
        # pylint: disable=bad-whitespace
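The new TestNanmean cases lean on the module's dense_sparse decorator to run each assertion against both a dense array and its sparse counterpart. A minimal sketch of how such a decorator can be written (an assumption for illustration; the actual helper in Orange/tests/test_statistics.py may differ, e.g. by using subTest):

import functools

import numpy as np
from scipy.sparse import csr_matrix


def dense_sparse(test_case):
    """Run the wrapped test once with np.asarray and once with csr_matrix."""
    @functools.wraps(test_case)
    def wrapper(self):
        for array in (np.asarray, csr_matrix):
            test_case(self, array)  # 'array' converts the test's fixture data
    return wrapper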
