diff --git a/Orange/data/domain.py b/Orange/data/domain.py index bd06436f7f8..e7c6f779cee 100644 --- a/Orange/data/domain.py +++ b/Orange/data/domain.py @@ -44,6 +44,18 @@ class DomainConversion: .. attribute:: metas Indices for meta attributes + + .. attribute:: sparse_X + + Flag whether the resulting X matrix should be sparse. + + .. attribute:: sparse_Y + + Flag whether the resulting Y matrix should be sparse. + + .. attribute:: sparse_metas + + Flag whether the resulting metas matrix should be sparse. """ def __init__(self, source, destination): @@ -63,6 +75,21 @@ def __init__(self, source, destination): source.index(var) if var in source else var.compute_value for var in destination.metas] + def should_be_sparse(feats): + """ + For a matrix to be stored in sparse, more than 2/3 of columns + should be marked as sparse and there should be no string columns + since Scipy's sparse matrices don't support dtype=object. + """ + fraction_sparse = sum(f.sparse for f in feats) / max(len(feats), 1) + contain_strings = any(f.is_string for f in feats) + return fraction_sparse > 2/3 and not contain_strings + + # check whether X, Y or metas should be sparse + self.sparse_X = should_be_sparse(destination.attributes) + self.sparse_Y = should_be_sparse(destination.class_vars) + self.sparse_metas = should_be_sparse(destination.metas) + def filter_visible(feats): """ diff --git a/Orange/data/table.py b/Orange/data/table.py index 8b4cbbdd32a..db1312fe48e 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -17,7 +17,8 @@ Domain, Variable, Storage, StringVariable, Unknown, Value, Instance, ContinuousVariable, DiscreteVariable, MISSING_VALUES ) -from Orange.data.util import SharedComputeValue, vstack, hstack +from Orange.data.util import SharedComputeValue, vstack, hstack, assure_array_dense, assure_array_sparse, \ + assure_column_dense, assure_column_sparse from Orange.statistics.util import bincount, countnans, contingency, \ stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \ sparse_implicit_zero_weights @@ -280,44 +281,38 @@ def from_table(cls, domain, source, row_indices=...): global _conversion_cache - def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, - is_sparse=False): - + def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False): if not len(src_cols): if is_sparse: return sp.csr_matrix((n_rows, 0), dtype=source.X.dtype) else: return np.zeros((n_rows, 0), dtype=source.X.dtype) + # match density for subarrays + match_density = assure_array_sparse if is_sparse else assure_array_dense n_src_attrs = len(source.domain.attributes) if all(isinstance(x, Integral) and 0 <= x < n_src_attrs for x in src_cols): - return _subarray(source.X, row_indices, src_cols) + return match_density(_subarray(source.X, row_indices, src_cols)) if all(isinstance(x, Integral) and x < 0 for x in src_cols): - arr = _subarray(source.metas, row_indices, - [-1 - x for x in src_cols]) + arr = match_density(_subarray(source.metas, row_indices, + [-1 - x for x in src_cols])) if arr.dtype != dtype: return arr.astype(dtype) return arr if all(isinstance(x, Integral) and x >= n_src_attrs for x in src_cols): - return _subarray(source._Y, row_indices, - [x - n_src_attrs for x in src_cols]) + return match_density(_subarray( + source._Y, row_indices, + [x - n_src_attrs for x in src_cols])) + # initialize final array & set `match_density` for columns if is_sparse: a = sp.dok_matrix((n_rows, len(src_cols)), dtype=dtype) + match_density = assure_column_sparse else: a = np.empty((n_rows, len(src_cols)), dtype=dtype) - - def match_type(x): - """ Assure that matrix and column are both dense or sparse. """ - if is_sparse == sp.issparse(x): - return x - elif is_sparse: - x = np.asarray(x) - return sp.csc_matrix(x.reshape(-1, 1).astype(np.float)) - else: - return np.ravel(x.toarray()) + match_density = assure_column_dense shared_cache = _conversion_cache for i, col in enumerate(src_cols): @@ -330,22 +325,22 @@ def match_type(x): col.compute_shared(source) shared = shared_cache[id(col.compute_shared), id(source)] if row_indices is not ...: - a[:, i] = match_type( + a[:, i] = match_density( col(source, shared_data=shared)[row_indices]) else: - a[:, i] = match_type( + a[:, i] = match_density( col(source, shared_data=shared)) else: if row_indices is not ...: - a[:, i] = match_type(col(source)[row_indices]) + a[:, i] = match_density(col(source)[row_indices]) else: - a[:, i] = match_type(col(source)) + a[:, i] = match_density(col(source)) elif col < 0: - a[:, i] = match_type(source.metas[row_indices, -1 - col]) + a[:, i] = match_density(source.metas[row_indices, -1 - col]) elif col < n_src_attrs: - a[:, i] = match_type(source.X[row_indices, col]) + a[:, i] = match_density(source.X[row_indices, col]) else: - a[:, i] = match_type( + a[:, i] = match_density( source._Y[row_indices, col - n_src_attrs]) if is_sparse: @@ -366,6 +361,8 @@ def match_type(x): table = cls.from_table_rows(source, row_indices) # assure resulting domain is the instance passed on input table.domain = domain + # since sparse flags are not considered when checking for domain equality, fix manually. + table = assure_domain_conversion_sparsity(table, source) return table if isinstance(row_indices, slice): @@ -382,18 +379,19 @@ def match_type(x): self.domain = domain conversion = domain.get_conversion(source.domain) self.X = get_columns(row_indices, conversion.attributes, n_rows, - is_sparse=sp.issparse(source.X)) + is_sparse=conversion.sparse_X) if self.X.ndim == 1: self.X = self.X.reshape(-1, len(self.domain.attributes)) + self.Y = get_columns(row_indices, conversion.class_vars, n_rows, - is_sparse=sp.issparse(source.Y)) + is_sparse=conversion.sparse_Y) dtype = np.float64 if any(isinstance(var, StringVariable) for var in domain.metas): dtype = np.object self.metas = get_columns(row_indices, conversion.metas, n_rows, dtype, - is_sparse=sp.issparse(source.metas)) + is_sparse=conversion.sparse_metas) if self.metas.ndim == 1: self.metas = self.metas.reshape(-1, len(self.domain.metas)) if source.has_weights(): @@ -1651,6 +1649,40 @@ def guessed_var(i, var_name): self.attributes["old_domain"] = table.domain return self + def to_sparse(self, sparse_attributes=True, sparse_class=False, + sparse_metas=False): + def sparsify(features): + for f in features: + f.sparse = True + + new_domain = self.domain.copy() + + if sparse_attributes: + sparsify(new_domain.attributes) + if sparse_class: + sparsify(new_domain.class_vars) + if sparse_metas: + sparsify(new_domain.metas) + return self.transform(new_domain) + + def to_dense(self, dense_attributes=True, dense_class=True, + dense_metas=True): + def densify(features): + for f in features: + f.sparse = False + + new_domain = self.domain.copy() + + if dense_attributes: + densify(new_domain.attributes) + if dense_class: + densify(new_domain.class_vars) + if dense_metas: + densify(new_domain.metas) + t = self.transform(new_domain) + t.ids = self.ids # preserve indices + return t + def _check_arrays(*arrays, dtype=None): checked = [] @@ -1672,7 +1704,7 @@ def ninstances(array): if ninstances(array) != shape_1: raise ValueError("Leading dimension mismatch (%d != %d)" - % (len(array), shape_1)) + % (ninstances(array), shape_1)) if sp.issparse(array): array.data = np.asarray(array.data) @@ -1743,3 +1775,23 @@ def _rxc_ix(rows, cols): else: r, c = np.ix_(rows, cols) return np.asarray(r, int), np.asarray(c, int) + + +def assure_domain_conversion_sparsity(target, source): + """ + Assure that the table obeys the domain conversion's suggestions about sparsity. + + Args: + target (Table): the target table. + source (Table): the source table. + + Returns: + Table: with fixed sparsity. The sparsity is set as it is recommended by domain conversion + for transformation from source to the target domain. + """ + conversion = target.domain.get_conversion(source.domain) + match_density = [assure_array_dense, assure_array_sparse] + target.X = match_density[conversion.sparse_X](target.X) + target.Y = match_density[conversion.sparse_Y](target.Y) + target.metas = match_density[conversion.sparse_metas](target.metas) + return target diff --git a/Orange/data/util.py b/Orange/data/util.py index 01f39831547..c9ae5a60dfb 100644 --- a/Orange/data/util.py +++ b/Orange/data/util.py @@ -89,3 +89,34 @@ def hstack(arrays): return sp.hstack(arrays) else: return np.hstack(arrays) + + +def assure_array_dense(a): + if sp.issparse(a): + a = a.toarray() + return a + + +def assure_array_sparse(a): + if not sp.issparse(a): + # since x can be a list, cast to np.array + # since x can come from metas with string, cast to float + a = np.asarray(a).astype(np.float) + return sp.csc_matrix(a) + return a + + +def assure_column_sparse(a): + a = assure_array_sparse(a) + # if x of shape (n, ) is passed to csc_matrix constructor, + # the resulting matrix is of shape (1, n) and hence we + # need to transpose it to make it a column + if a.shape[0] == 1: + a = a.T + return a + + +def assure_column_dense(a): + a = assure_array_dense(a) + # column assignments must be of shape (n,) and not (n, 1) + return np.ravel(a) diff --git a/Orange/preprocess/discretize.py b/Orange/preprocess/discretize.py index 0878ba11f7e..30e827c3e73 100644 --- a/Orange/preprocess/discretize.py +++ b/Orange/preprocess/discretize.py @@ -1,7 +1,7 @@ import numpy as np import scipy.sparse as sp -from Orange.data import DiscreteVariable, Domain, Table +from Orange.data import DiscreteVariable, Domain from Orange.data.sql.table import SqlTable from Orange.preprocess.util import _RefuseDataInConstructor from Orange.statistics import distribution, contingency diff --git a/Orange/preprocess/impute.py b/Orange/preprocess/impute.py index a37568e02ea..68f968891a2 100644 --- a/Orange/preprocess/impute.py +++ b/Orange/preprocess/impute.py @@ -157,7 +157,7 @@ def __call__(self, data): column = np.array([float(data[self.variable])]) else: column = np.array(data.get_column_view(self.variable)[0], - copy=True) + copy=True) mask = np.isnan(column) if not np.any(mask): diff --git a/Orange/tests/test_domain.py b/Orange/tests/test_domain.py index 59de6894819..9deef63176c 100644 --- a/Orange/tests/test_domain.py +++ b/Orange/tests/test_domain.py @@ -3,7 +3,7 @@ import warnings from time import time from numbers import Real -from itertools import starmap +from itertools import starmap, chain import unittest import pickle @@ -11,7 +11,7 @@ from numpy.testing import assert_array_equal from Orange.data import ( - ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable, + ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable, Variable, Domain, Table, DomainConversion) from Orange.data.domain import filter_visible from Orange.preprocess import Continuize, Impute @@ -165,7 +165,8 @@ def test_from_numpy_values(self): (0, 2, DiscreteVariable), (18, 23, ContinuousVariable)]: n_rows, n_cols, = aran_max - aran_min, 1 - d = Domain.from_numpy(np.zeros((1, 1)), np.arange(aran_min, aran_max).reshape(n_rows, n_cols)) + d = Domain.from_numpy(np.zeros((1, 1)), + np.arange(aran_min, aran_max).reshape(n_rows, n_cols)) self.assertTrue(d.anonymous) self.assertIsInstance(d.class_var, vartype) if isinstance(vartype, DiscreteVariable): @@ -402,14 +403,14 @@ def test_conversion(self): assert_array_equal(y, np.array([0])) metas_exp = [gender.Unknown, education.Unknown, ssn.Unknown] - def eq(a, b): + def equal(a, b): if isinstance(a, Real) and isinstance(b, Real) and \ np.isnan(a) and np.isnan(b): return True else: return a == b - self.assertTrue(all(starmap(eq, zip(metas, metas_exp)))) + self.assertTrue(all(starmap(equal, zip(metas, metas_exp)))) x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"]) assert_array_equal(x, np.array([42, 13])) @@ -502,6 +503,42 @@ def test_copy(self): self.assertEqual(domain[age].number_of_decimals, 5) self.assertEqual(new_domain[age].number_of_decimals, 10) + def test_domain_conversion_sparsity(self): + destination = Domain( + attributes=[ + ContinuousVariable(name='a'), + ContinuousVariable(name='b'), + ContinuousVariable(name='c'), + ], + class_vars=[DiscreteVariable('d', values=['e'])], + metas=[StringVariable('f')] + ) + + # all dense + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertFalse(conversion.sparse_X) + self.assertFalse(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + + # set destination attributes as sparse + for a in destination.attributes: + a.sparse = True + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertTrue(conversion.sparse_X) + self.assertFalse(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + + # set all destination variable as sparse + for a in chain(destination.variables, destination.metas): + a.sparse = True + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertTrue(conversion.sparse_X) + self.assertTrue(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + class TestDomainFilter(unittest.TestCase): def setUp(self): diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py index b6c42239015..415a6a8854f 100644 --- a/Orange/tests/test_normalize.py +++ b/Orange/tests/test_normalize.py @@ -98,11 +98,11 @@ def test_normalize_transform_by_span_zero_class(self): def test_normalize_sparse(self): domain = Domain([ContinuousVariable(str(i)) for i in range(3)]) # pylint: disable=bad-whitespace - X = sp.csr_matrix(np.array([ + X = np.array([ [0, -1, -2], [0, 1, 2], - ])) - data = Table.from_numpy(domain, X) + ]) + data = Table.from_numpy(domain, X).to_sparse() # pylint: disable=bad-whitespace solution = sp.csr_matrix(np.array([ diff --git a/Orange/tests/test_remove.py b/Orange/tests/test_remove.py index 759f1bf3e92..3b70931062a 100644 --- a/Orange/tests/test_remove.py +++ b/Orange/tests/test_remove.py @@ -4,12 +4,12 @@ import unittest import numpy as np -import scipy.sparse as sp from Orange.data import Table from Orange.preprocess import Remove from Orange.tests import test_filename + class TestRemover(unittest.TestCase): @classmethod @@ -138,8 +138,7 @@ def test_remove_unused_values_metas(self): def test_remove_unused_values_attr_sparse(self): data = self.test8 - data = data[1:] - data.X = sp.csr_matrix(data.X) + data = data[1:].to_sparse() remover = Remove(Remove.RemoveUnusedValues) new_data = remover(data) attr_res = remover.attr_results diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index d48a802be8d..aca641b1d67 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -640,8 +640,7 @@ def test_copy(self): self.assertFalse(np.all(t.metas == copy.metas)) def test_copy_sparse(self): - t = data.Table('iris') - t.X = sp.csr_matrix(t.X) + t = data.Table('iris').to_sparse() copy = t.copy() self.assertEqual((t.X != copy.X).nnz, 0) # sparse matrices match by content @@ -1778,7 +1777,7 @@ def test_can_filter_row_with_slice(self): new_table, self.table, rows=slice_) def test_can_use_attributes_as_new_columns(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) order = [random.randrange(a) for _ in self.domain.attributes] new_attributes = [self.domain.attributes[i] for i in order] new_domain = self.create_domain( @@ -1789,7 +1788,7 @@ def test_can_use_attributes_as_new_columns(self): new_table, self.table, xcols=order, ycols=order, mcols=order) def test_can_use_class_vars_as_new_columns(self): - a, c, m = column_sizes(self.table) + a, c, _ = column_sizes(self.table) order = [random.randrange(a, a + c) for _ in self.domain.class_vars] new_classes = [self.domain.class_vars[i - a] for i in order] new_domain = self.create_domain(new_classes, new_classes, new_classes) @@ -1799,7 +1798,7 @@ def test_can_use_class_vars_as_new_columns(self): new_table, self.table, xcols=order, ycols=order, mcols=order) def test_can_use_metas_as_new_columns(self): - a, c, m = column_sizes(self.table) + _, _, m = column_sizes(self.table) order = [random.randrange(-m + 1, 0) for _ in self.domain.metas] new_metas = [self.domain.metas[::-1][i] for i in order] new_domain = self.create_domain(new_metas, new_metas, new_metas) @@ -1845,12 +1844,11 @@ def test_creates_table_with_given_domain_and_row_filter(self): new_table, self.table[:0], xcols=order, ycols=order, mcols=order) def test_from_table_sparse_move_some_to_empty_metas(self): - iris = data.Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = data.Table("iris").to_sparse() new_domain = data.domain.Domain( iris.domain.attributes[:2], iris.domain.class_vars, iris.domain.attributes[2:], source=iris.domain) - new_iris = data.Table.from_table(new_domain, iris) + new_iris = iris.transform(new_domain) self.assertTrue(sp.issparse(new_iris.X)) self.assertTrue(sp.issparse(new_iris.metas)) @@ -1858,37 +1856,35 @@ def test_from_table_sparse_move_some_to_empty_metas(self): self.assertEqual(new_iris.metas.shape, (len(iris), 2)) # move back - back_iris = data.Table.from_table(iris.domain, new_iris) + back_iris = new_iris.transform(iris.domain) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) - self.assertTrue(sp.issparse(back_iris.metas)) + self.assertFalse(sp.issparse(back_iris.metas)) self.assertEqual(back_iris.X.shape, iris.X.shape) self.assertEqual(back_iris.metas.shape, iris.metas.shape) def test_from_table_sparse_move_all_to_empty_metas(self): - iris = data.Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = data.Table("iris").to_sparse() new_domain = data.domain.Domain( [], iris.domain.class_vars, iris.domain.attributes, source=iris.domain) - new_iris = data.Table.from_table(new_domain, iris) + new_iris = iris.transform(new_domain) - self.assertTrue(sp.issparse(new_iris.X)) + self.assertFalse(sp.issparse(new_iris.X)) self.assertTrue(sp.issparse(new_iris.metas)) self.assertEqual(new_iris.X.shape, (len(iris), 0)) self.assertEqual(new_iris.metas.shape, (len(iris), 4)) # move back - back_iris = data.Table.from_table(iris.domain, new_iris) + back_iris = new_iris.transform(iris.domain) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) - self.assertTrue(sp.issparse(back_iris.metas)) + self.assertFalse(sp.issparse(back_iris.metas)) self.assertEqual(back_iris.X.shape, iris.X.shape) self.assertEqual(back_iris.metas.shape, iris.metas.shape) def test_from_table_sparse_move_to_nonempty_metas(self): - brown = data.Table("brown-selected") - brown.X = sp.csr_matrix(brown.X) + brown = data.Table("brown-selected").to_sparse() n_attr = len(brown.domain.attributes) n_metas = len(brown.domain.metas) new_domain = data.domain.Domain( @@ -2053,7 +2049,7 @@ def test_can_assign_values(self): self.assertAlmostEqual(self.table.X[0, 0], 42.) def test_can_assign_values_to_classes(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) self.table[0, a] = 42. self.assertAlmostEqual(self.table.Y[0], 42.) @@ -2071,7 +2067,7 @@ def test_can_assign_rows_to_rows(self): self.table.metas[0], self.table.metas[1]) def test_can_assign_lists(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) new_example = [float(i) for i in range(len(self.attributes + self.class_vars))] self.table[0] = new_example @@ -2081,7 +2077,7 @@ def test_can_assign_lists(self): self.table.Y[0], np.array(new_example[a:])) def test_can_assign_np_array(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) new_example = \ np.array([float(i) for i in range(len(self.attributes + self.class_vars))]) @@ -2203,7 +2199,7 @@ def test_delete_rows(self): def test_clear(self): self.table.clear() self.assertEqual(len(self.table), 0) - for i in self.table: + for _ in self.table: self.fail("Table should not contain any rows.") def test_subclasses(self): @@ -2705,6 +2701,81 @@ def _compare_tables(self, table1, table2): for x in table2.domain.metas]) +class SparseCV: + def __call__(self, data): + return sp.csr_matrix((len(data), 1)) + + +class TestTableSparseDense(unittest.TestCase): + def setUp(self): + self.iris = Table('iris') + + def test_sparse_dense_transformation(self): + iris = Table('iris') + iris_sparse = iris.to_sparse(sparse_attributes=True) + self.assertTrue(sp.issparse(iris_sparse.X)) + self.assertFalse(sp.issparse(iris_sparse.Y)) + self.assertFalse(sp.issparse(iris_sparse.metas)) + + iris_sparse = iris.to_sparse(sparse_attributes=True, sparse_class=True) + self.assertTrue(sp.issparse(iris_sparse.X)) + self.assertTrue(sp.issparse(iris_sparse.Y)) + self.assertFalse(sp.issparse(iris_sparse.metas)) + + dense_iris = iris_sparse.to_dense() + self.assertFalse(sp.issparse(dense_iris.X)) + self.assertFalse(sp.issparse(dense_iris.Y)) + self.assertFalse(sp.issparse(dense_iris.metas)) + + def test_from_table_add_one_sparse_column(self): + # add one sparse feature, should remain dense + domain = self.iris.domain.copy() + domain.attributes += ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertFalse(sp.issparse(d.X)) + + def test_from_table_add_lots_of_sparse_columns(self): + n_attrs = len(self.iris.domain.attributes) + + # add 2*n_attrs+1 sparse feature, should became sparse + domain = self.iris.domain.copy() + domain.attributes += tuple( + ContinuousVariable('S' + str(i), compute_value=SparseCV(), sparse=True) + for i in range(2*n_attrs + 1) + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.X)) + + def test_from_table_replace_attrs_with_sparse(self): + # replace attrs with a sparse feature, should became sparse + domain = self.iris.domain.copy() + domain.attributes = ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.X)) + + def test_from_table_sparse_metas(self): + # replace metas with a sparse feature, should became sparse + domain = self.iris.domain.copy() + domain._metas = ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.metas)) + + def test_from_table_sparse_metas_with_strings(self): + # replace metas with text and 100 sparse features, should be dense + domain = self.iris.domain.copy() + domain._metas = (StringVariable('text'),) + tuple( + ContinuousVariable('S' + str(i), compute_value=SparseCV(), sparse=True) + for i in range(100) + ) + d = self.iris.transform(domain) + self.assertFalse(sp.issparse(d.metas)) + if __name__ == "__main__": unittest.main() diff --git a/Orange/widgets/data/tests/test_owfile.py b/Orange/widgets/data/tests/test_owfile.py index fce9f8299dc..bf3e7a5fa67 100644 --- a/Orange/widgets/data/tests/test_owfile.py +++ b/Orange/widgets/data/tests/test_owfile.py @@ -308,8 +308,7 @@ def test_no_specified_reader(self): self.assertTrue(self.widget.Error.missing_reader.is_shown()) def test_domain_edit_on_sparse_data(self): - iris = Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = Table("iris").to_sparse() f = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False) pickle.dump(iris, f) diff --git a/Orange/widgets/data/tests/test_owmergedata.py b/Orange/widgets/data/tests/test_owmergedata.py index f125c1fc38a..9348e0c5c1d 100644 --- a/Orange/widgets/data/tests/test_owmergedata.py +++ b/Orange/widgets/data/tests/test_owmergedata.py @@ -435,8 +435,7 @@ def test_sparse(self): """ data = Table("iris")[::25] data_ed_dense = Table("titanic")[::300] - data_ed_sparse = Table("titanic")[::300] - data_ed_sparse.X = sp.csr_matrix(data_ed_sparse.X) + data_ed_sparse = Table("titanic")[::300].to_sparse() self.send_signal("Data", data) self.send_signal("Extra Data", data_ed_dense) diff --git a/Orange/widgets/model/tests/test_tree.py b/Orange/widgets/model/tests/test_tree.py index 18f48061892..2f3d919d39b 100644 --- a/Orange/widgets/model/tests/test_tree.py +++ b/Orange/widgets/model/tests/test_tree.py @@ -1,6 +1,5 @@ # pylint: disable=protected-access import numpy as np -import scipy.sparse as sp from Orange.base import Model from Orange.data import Table @@ -49,8 +48,7 @@ def test_sparse_data_classification(self): table1 = Table("iris") self.send_signal("Data", table1) model_dense = self.get_output("Model") - table2 = Table("iris") - table2.X = sp.csr_matrix(table2.X) + table2 = Table("iris").to_sparse() self.send_signal("Data", table2) model_sparse = self.get_output("Model") self.assertTrue(np.array_equal(model_dense._code, model_sparse._code)) @@ -64,8 +62,7 @@ def test_sparse_data_regression(self): table1 = Table("housing") self.send_signal("Data", table1) model_dense = self.get_output("Model") - table2 = Table("housing") - table2.X = sp.csr_matrix(table2.X) + table2 = Table("housing").to_sparse() self.send_signal("Data", table2) model_sparse = self.get_output("Model") self.assertTrue(np.array_equal(model_dense._code, model_sparse._code)) diff --git a/Orange/widgets/tests/base.py b/Orange/widgets/tests/base.py index 5db342a42e5..42b4d940aa1 100644 --- a/Orange/widgets/tests/base.py +++ b/Orange/widgets/tests/base.py @@ -24,7 +24,8 @@ from Orange.classification.base_classification import ( LearnerClassification, ModelClassification ) -from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable +from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable,\ + Variable from Orange.modelling import Fitter from Orange.preprocess import RemoveNaNColumns, Randomize from Orange.preprocess.preprocess import PreprocessorList @@ -100,6 +101,7 @@ def setUpClass(cls): report = OWReport() cls.widgets.append(report) OWReport.get_instance = lambda: report + Variable._clear_all_caches() def tearDown(self): """Process any pending events before the next test is executed.""" @@ -687,6 +689,7 @@ class WidgetOutputsTestMixin: """ def init(self): + Variable._clear_all_caches() self.data = Table("iris") self.same_input_output_domain = True diff --git a/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py b/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py index 7578637a357..bdca5b1ac48 100644 --- a/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py +++ b/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py @@ -66,8 +66,7 @@ def _compare_tables(self, _output, n_components): np.testing.assert_array_equal(self.iris.metas, _output.metas) def test_sparse_data(self): - data = Table("iris") - data.X = sparse.csr_matrix(data.X) + data = Table("iris").to_sparse() self.assertTrue(sparse.issparse(data.X)) self.widget.manifold_method_index = 2 self.send_signal(self.widget.Inputs.data, data) diff --git a/Orange/widgets/utils/itemmodels.py b/Orange/widgets/utils/itemmodels.py index c99c5fa4cf4..b135f55550b 100644 --- a/Orange/widgets/utils/itemmodels.py +++ b/Orange/widgets/utils/itemmodels.py @@ -1105,7 +1105,7 @@ def make_basket_formater(vars, density, role): elif role == TableModel.ClassVar: getter = operator.attrgetter("sparse_y") elif role == TableModel.Meta: - getter = operator.attrgetter("sparse_meta") + getter = operator.attrgetter("sparse_metas") return partial(formater, vars, getter) def make_basket(vars, density, role): diff --git a/Orange/widgets/visualize/owheatmap.py b/Orange/widgets/visualize/owheatmap.py index 982ed57308f..0c515c4bb97 100644 --- a/Orange/widgets/visualize/owheatmap.py +++ b/Orange/widgets/visualize/owheatmap.py @@ -663,8 +663,7 @@ def set_dataset(self, data=None): if data is not None and sp.issparse(data.X): try: - data = data.copy() - data.X = data.X.toarray() + data = data.to_dense() except MemoryError: data = None self.Error.not_enough_memory() diff --git a/Orange/widgets/visualize/owscatterplotgraph.py b/Orange/widgets/visualize/owscatterplotgraph.py index e60265a0b2a..38f95e718e0 100644 --- a/Orange/widgets/visualize/owscatterplotgraph.py +++ b/Orange/widgets/visualize/owscatterplotgraph.py @@ -623,13 +623,7 @@ def sparse_to_dense(self): domain = data.domain all_attrs = domain.variables + domain.metas attrs = list(set(all_attrs) & attrs) - selected_data = data[:, attrs] - if sp.issparse(selected_data.X): - selected_data.X = selected_data.X.toarray() - if sp.issparse(selected_data.Y): - selected_data.Y = selected_data.Y.toarray() - if sp.issparse(selected_data.metas): - selected_data.metas = selected_data.metas.toarray() + selected_data = data[:, attrs].to_dense() return selected_data def _clear_plot_widget(self): diff --git a/Orange/widgets/visualize/owsieve.py b/Orange/widgets/visualize/owsieve.py index 6afebdb7386..1137876eded 100644 --- a/Orange/widgets/visualize/owsieve.py +++ b/Orange/widgets/visualize/owsieve.py @@ -209,7 +209,7 @@ def discretizer(data): discretize = Discretize( method=EqualFreq(n=4), remove_const=False, discretize_classes=True, discretize_metas=True) - return discretize(data) + return discretize(data).to_dense() return data if not data.is_sparse() and not init: @@ -219,7 +219,6 @@ def discretizer(data): self.attr_y} new_domain = data.domain.select_columns(attrs) data = Table.from_table(new_domain, data) - data.X = data.X.toarray() return discretizer(data) @Inputs.features diff --git a/Orange/widgets/visualize/tests/test_owscatterplot.py b/Orange/widgets/visualize/tests/test_owscatterplot.py index 4f455920a7f..baec6ef9d0f 100644 --- a/Orange/widgets/visualize/tests/test_owscatterplot.py +++ b/Orange/widgets/visualize/tests/test_owscatterplot.py @@ -317,11 +317,8 @@ def test_sparse(self): GH-2152 GH-2157 """ - table = Table("iris") - table.X = sp.csr_matrix(table.X) - self.assertTrue(sp.issparse(table.X)) - table.Y = sp.csr_matrix(table._Y) # pylint: disable=protected-access - self.assertTrue(sp.issparse(table.Y)) + table = Table("iris").to_sparse(sparse_attributes=True, + sparse_class=True) self.send_signal(self.widget.Inputs.data, table) self.widget.set_subset_data(table[:30]) data = self.get_output("Data") diff --git a/Orange/widgets/visualize/tests/test_owsieve.py b/Orange/widgets/visualize/tests/test_owsieve.py index 2280ba1ef0d..a8e72b44c5f 100644 --- a/Orange/widgets/visualize/tests/test_owsieve.py +++ b/Orange/widgets/visualize/tests/test_owsieve.py @@ -3,7 +3,6 @@ from math import isnan from unittest.mock import patch import numpy as np -import scipy.sparse as sp from AnyQt.QtCore import QEvent, QPoint, Qt from AnyQt.QtGui import QMouseEvent @@ -100,7 +99,7 @@ def test_sparse_data(self): output = self.get_output("Data") self.assertFalse(output.is_sparse()) - table.X = sp.csr_matrix(table.X) + table = table.to_sparse() self.send_signal(self.widget.Inputs.data, table) self.assertEqual(len(self.widget.discrete_data.domain), 2) output = self.get_output("Data")