From 951fd8ebcc71f413e112b48cddce133b590f4eb2 Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 25 May 2017 12:39:20 +0200 Subject: [PATCH 01/14] DomainConversion: Add sparsity suggestions DomainConversion now has three more attributes `sparse_X`, `sparse_Y` and `sparse_metas`, which suggest whether the resulting matrix should be sparse or dense. --- Orange/data/domain.py | 27 ++++++++++++++++++++++++++ Orange/tests/test_domain.py | 38 ++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/Orange/data/domain.py b/Orange/data/domain.py index bd06436f7f8..e7c6f779cee 100644 --- a/Orange/data/domain.py +++ b/Orange/data/domain.py @@ -44,6 +44,18 @@ class DomainConversion: .. attribute:: metas Indices for meta attributes + + .. attribute:: sparse_X + + Flag whether the resulting X matrix should be sparse. + + .. attribute:: sparse_Y + + Flag whether the resulting Y matrix should be sparse. + + .. attribute:: sparse_metas + + Flag whether the resulting metas matrix should be sparse. """ def __init__(self, source, destination): @@ -63,6 +75,21 @@ def __init__(self, source, destination): source.index(var) if var in source else var.compute_value for var in destination.metas] + def should_be_sparse(feats): + """ + For a matrix to be stored in sparse, more than 2/3 of columns + should be marked as sparse and there should be no string columns + since Scipy's sparse matrices don't support dtype=object. 
+ """ + fraction_sparse = sum(f.sparse for f in feats) / max(len(feats), 1) + contain_strings = any(f.is_string for f in feats) + return fraction_sparse > 2/3 and not contain_strings + + # check whether X, Y or metas should be sparse + self.sparse_X = should_be_sparse(destination.attributes) + self.sparse_Y = should_be_sparse(destination.class_vars) + self.sparse_metas = should_be_sparse(destination.metas) + def filter_visible(feats): """ diff --git a/Orange/tests/test_domain.py b/Orange/tests/test_domain.py index 59de6894819..1207b54a894 100644 --- a/Orange/tests/test_domain.py +++ b/Orange/tests/test_domain.py @@ -3,7 +3,7 @@ import warnings from time import time from numbers import Real -from itertools import starmap +from itertools import starmap, chain import unittest import pickle @@ -502,6 +502,42 @@ def test_copy(self): self.assertEqual(domain[age].number_of_decimals, 5) self.assertEqual(new_domain[age].number_of_decimals, 10) + def test_domain_conversion_sparsity(self): + destination = Domain( + attributes=[ + ContinuousVariable(name='a'), + ContinuousVariable(name='b'), + ContinuousVariable(name='c'), + ], + class_vars=[DiscreteVariable('d', values=['e'])], + metas=[StringVariable('f')] + ) + + # all dense + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertFalse(conversion.sparse_X) + self.assertFalse(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + + # set destination attributes as sparse + for a in destination.attributes: + a.sparse = True + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertTrue(conversion.sparse_X) + self.assertFalse(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + + # set all destination variable as sparse + for a in chain(destination.variables, destination.metas): + a.sparse = True + source = Domain(attributes=[]) + conversion = DomainConversion(source, destination) + self.assertTrue(conversion.sparse_X) 
+ self.assertTrue(conversion.sparse_Y) + self.assertFalse(conversion.sparse_metas) + class TestDomainFilter(unittest.TestCase): def setUp(self): From 7fcdbdead93de9b99b65ff04ed969a1e0479ac08 Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 25 May 2017 12:42:49 +0200 Subject: [PATCH 02/14] Table: Set density according to DomainConversion Table.from_table now sets the sparsity of X, Y and metas as determined by DomainConversion. --- Orange/data/table.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index 8b4cbbdd32a..f360fb40f02 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -382,18 +382,19 @@ def match_type(x): self.domain = domain conversion = domain.get_conversion(source.domain) self.X = get_columns(row_indices, conversion.attributes, n_rows, - is_sparse=sp.issparse(source.X)) + is_sparse=conversion.sparse_X) if self.X.ndim == 1: self.X = self.X.reshape(-1, len(self.domain.attributes)) + self.Y = get_columns(row_indices, conversion.class_vars, n_rows, - is_sparse=sp.issparse(source.Y)) + is_sparse=conversion.sparse_Y) dtype = np.float64 if any(isinstance(var, StringVariable) for var in domain.metas): dtype = np.object self.metas = get_columns(row_indices, conversion.metas, n_rows, dtype, - is_sparse=sp.issparse(source.metas)) + is_sparse=conversion.sparse_metas) if self.metas.ndim == 1: self.metas = self.metas.reshape(-1, len(self.domain.metas)) if source.has_weights(): From bef9c4b68ab7f62b46443005818016a026cc6dd9 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 17 Nov 2017 09:28:12 +0100 Subject: [PATCH 03/14] Data.utils: Add methods for assuring sparse/dense --- Orange/data/util.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Orange/data/util.py b/Orange/data/util.py index 01f39831547..c9ae5a60dfb 100644 --- a/Orange/data/util.py +++ b/Orange/data/util.py @@ -89,3 +89,34 @@ def hstack(arrays): return sp.hstack(arrays) else: return 
np.hstack(arrays) + + +def assure_array_dense(a): + if sp.issparse(a): + a = a.toarray() + return a + + +def assure_array_sparse(a): + if not sp.issparse(a): + # since x can be a list, cast to np.array + # since x can come from metas with string, cast to float + a = np.asarray(a).astype(np.float) + return sp.csc_matrix(a) + return a + + +def assure_column_sparse(a): + a = assure_array_sparse(a) + # if x of shape (n, ) is passed to csc_matrix constructor, + # the resulting matrix is of shape (1, n) and hence we + # need to transpose it to make it a column + if a.shape[0] == 1: + a = a.T + return a + + +def assure_column_dense(a): + a = assure_array_dense(a) + # column assignments must be of shape (n,) and not (n, 1) + return np.ravel(a) From 7fa9f05bfd37861fd4b1c58b1a9eda33c777649c Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 25 May 2017 12:44:47 +0200 Subject: [PATCH 04/14] Table.from_table: Obey is_sparse when returning subarrays When we return subarrays, the flag `is_sparse` wasn't considered, but we simply returned the subarray in its original format. Also, make sure subarrays aren't flattened to 1d, as it is required for columns. 
--- Orange/data/table.py | 47 ++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index f360fb40f02..49c2dfd7b95 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -17,7 +17,8 @@ Domain, Variable, Storage, StringVariable, Unknown, Value, Instance, ContinuousVariable, DiscreteVariable, MISSING_VALUES ) -from Orange.data.util import SharedComputeValue, vstack, hstack +from Orange.data.util import SharedComputeValue, vstack, hstack, assure_array_dense, assure_array_sparse, \ + assure_column_dense, assure_column_sparse from Orange.statistics.util import bincount, countnans, contingency, \ stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \ sparse_implicit_zero_weights @@ -280,44 +281,38 @@ def from_table(cls, domain, source, row_indices=...): global _conversion_cache - def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, - is_sparse=False): - + def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False): if not len(src_cols): if is_sparse: return sp.csr_matrix((n_rows, 0), dtype=source.X.dtype) else: return np.zeros((n_rows, 0), dtype=source.X.dtype) + # match density for subarrays + match_density = assure_array_sparse if is_sparse else assure_array_dense n_src_attrs = len(source.domain.attributes) if all(isinstance(x, Integral) and 0 <= x < n_src_attrs for x in src_cols): - return _subarray(source.X, row_indices, src_cols) + return match_density(_subarray(source.X, row_indices, src_cols)) if all(isinstance(x, Integral) and x < 0 for x in src_cols): - arr = _subarray(source.metas, row_indices, - [-1 - x for x in src_cols]) + arr = match_density(_subarray(source.metas, row_indices, + [-1 - x for x in src_cols])) if arr.dtype != dtype: return arr.astype(dtype) return arr if all(isinstance(x, Integral) and x >= n_src_attrs for x in src_cols): - return _subarray(source._Y, row_indices, - [x - 
n_src_attrs for x in src_cols]) + return match_density(_subarray( + source._Y, row_indices, + [x - n_src_attrs for x in src_cols])) + # initialize final array & set `match_density` for columns if is_sparse: a = sp.dok_matrix((n_rows, len(src_cols)), dtype=dtype) + match_density = assure_column_sparse else: a = np.empty((n_rows, len(src_cols)), dtype=dtype) - - def match_type(x): - """ Assure that matrix and column are both dense or sparse. """ - if is_sparse == sp.issparse(x): - return x - elif is_sparse: - x = np.asarray(x) - return sp.csc_matrix(x.reshape(-1, 1).astype(np.float)) - else: - return np.ravel(x.toarray()) + match_density = assure_column_dense shared_cache = _conversion_cache for i, col in enumerate(src_cols): @@ -330,22 +325,22 @@ def match_type(x): col.compute_shared(source) shared = shared_cache[id(col.compute_shared), id(source)] if row_indices is not ...: - a[:, i] = match_type( + a[:, i] = match_density( col(source, shared_data=shared)[row_indices]) else: - a[:, i] = match_type( + a[:, i] = match_density( col(source, shared_data=shared)) else: if row_indices is not ...: - a[:, i] = match_type(col(source)[row_indices]) + a[:, i] = match_density(col(source)[row_indices]) else: - a[:, i] = match_type(col(source)) + a[:, i] = match_density(col(source)) elif col < 0: - a[:, i] = match_type(source.metas[row_indices, -1 - col]) + a[:, i] = match_density(source.metas[row_indices, -1 - col]) elif col < n_src_attrs: - a[:, i] = match_type(source.X[row_indices, col]) + a[:, i] = match_density(source.X[row_indices, col]) else: - a[:, i] = match_type( + a[:, i] = match_density( source._Y[row_indices, col - n_src_attrs]) if is_sparse: From 6f004ea547a0cb3b727b7621e72a271e5e8dfaa8 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 3 Nov 2017 15:07:57 +0100 Subject: [PATCH 05/14] Table: Consider sparsity flags when domain match --- Orange/data/table.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Orange/data/table.py 
b/Orange/data/table.py index 49c2dfd7b95..85fc2813226 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -361,6 +361,8 @@ def get_columns(row_indices, src_cols, n_rows, dtype=np.float64, is_sparse=False table = cls.from_table_rows(source, row_indices) # assure resulting domain is the instance passed on input table.domain = domain + # since sparse flags are not considered when checking for domain equality, fix manually. + table = assure_domain_conversion_sparsity(table, source) return table if isinstance(row_indices, slice): @@ -1739,3 +1741,23 @@ def _rxc_ix(rows, cols): else: r, c = np.ix_(rows, cols) return np.asarray(r, int), np.asarray(c, int) + + +def assure_domain_conversion_sparsity(target, source): + """ + Assure that the table obeys the domain conversion's suggestions about sparsity. + + Args: + target (Table): the target table. + source (Table): the source table. + + Returns: + Table: with fixed sparsity. The sparsity is set as it is recommended by domain conversion + for transformation from source to the target domain. 
+ """ + conversion = target.domain.get_conversion(source.domain) + match_density = [assure_array_dense, assure_array_sparse] + target.X = match_density[conversion.sparse_X](target.X) + target.Y = match_density[conversion.sparse_Y](target.Y) + target.metas = match_density[conversion.sparse_metas](target.metas) + return target From 332af4e08f3beca4bb52149f94518218d709b394 Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 13 Jul 2017 17:15:52 +0200 Subject: [PATCH 06/14] Table: Add to_sparse and to_dense methods --- Orange/data/table.py | 34 ++++++++++++++++++++++++++++++++++ Orange/tests/test_table.py | 22 ++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/Orange/data/table.py b/Orange/data/table.py index 85fc2813226..a7331e606f5 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -1649,6 +1649,40 @@ def guessed_var(i, var_name): self.attributes["old_domain"] = table.domain return self + def to_sparse(self, sparse_attributes=True, sparse_class=False, + sparse_metas=False): + def sparsify(features): + for f in features: + f.sparse = True + + new_domain = self.domain.copy() + + if sparse_attributes: + sparsify(new_domain.attributes) + if sparse_class: + sparsify(new_domain.class_vars) + if sparse_metas: + sparsify(new_domain.metas) + return self.transform(new_domain) + + def to_dense(self, dense_attributes=True, dense_class=True, + dense_metas=True): + def densify(features): + for f in features: + f.sparse = False + + new_domain = self.domain.copy() + + if dense_attributes: + densify(new_domain.attributes) + if dense_class: + densify(new_domain.class_vars) + if dense_metas: + densify(new_domain.metas) + t = self.transform(new_domain) + t.ids = self.ids # preserve indices + return t + def _check_arrays(*arrays, dtype=None): checked = [] diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index d48a802be8d..75db709246e 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -2705,6 +2705,28 @@ def 
_compare_tables(self, table1, table2): for x in table2.domain.metas]) +class TestTableSparseDenseTransformations(unittest.TestCase): + def setUp(self): + self.iris = Table('iris') + + def test_conversion(self): + iris = Table('iris') + iris_sparse = iris.to_sparse(sparse_attributes=True) + self.assertTrue(sp.issparse(iris_sparse.X)) + self.assertFalse(sp.issparse(iris_sparse.Y)) + self.assertFalse(sp.issparse(iris_sparse.metas)) + + iris_sparse = iris.to_sparse(sparse_attributes=True, sparse_class=True) + self.assertTrue(sp.issparse(iris_sparse.X)) + self.assertTrue(sp.issparse(iris_sparse.Y)) + self.assertFalse(sp.issparse(iris_sparse.metas)) + + dense_iris = iris_sparse.to_dense() + self.assertFalse(sp.issparse(dense_iris.X)) + self.assertFalse(sp.issparse(dense_iris.Y)) + self.assertFalse(sp.issparse(dense_iris.metas)) + + if __name__ == "__main__": unittest.main() From 204c2d45cc725bd886175ff0635abb0047cfa820 Mon Sep 17 00:00:00 2001 From: nikicc Date: Tue, 18 Jul 2017 17:26:31 +0200 Subject: [PATCH 07/14] Table._check_arrays: Don't use len on sparse matrices Calling len on scipy sparse matrices causes the error: `TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]`. 
--- Orange/data/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index a7331e606f5..db1312fe48e 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -1704,7 +1704,7 @@ def ninstances(array): if ninstances(array) != shape_1: raise ValueError("Leading dimension mismatch (%d != %d)" - % (len(array), shape_1)) + % (ninstances(array), shape_1)) if sp.issparse(array): array.data = np.asarray(array.data) From 228afd4ae06e1ba437fe422bec6dadc1e3c066bf Mon Sep 17 00:00:00 2001 From: nikicc Date: Tue, 18 Jul 2017 17:37:58 +0200 Subject: [PATCH 08/14] TableModel: Fix showing sparse metas --- Orange/widgets/utils/itemmodels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Orange/widgets/utils/itemmodels.py b/Orange/widgets/utils/itemmodels.py index c99c5fa4cf4..b135f55550b 100644 --- a/Orange/widgets/utils/itemmodels.py +++ b/Orange/widgets/utils/itemmodels.py @@ -1105,7 +1105,7 @@ def make_basket_formater(vars, density, role): elif role == TableModel.ClassVar: getter = operator.attrgetter("sparse_y") elif role == TableModel.Meta: - getter = operator.attrgetter("sparse_meta") + getter = operator.attrgetter("sparse_metas") return partial(formater, vars, getter) def make_basket(vars, density, role): From e57f340f5c7da650ab781db2a50fa4e7c87141ff Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 3 Nov 2017 13:45:11 +0100 Subject: [PATCH 09/14] Tests: Fixups according to improved sparsity --- Orange/tests/test_normalize.py | 6 +++--- Orange/tests/test_remove.py | 4 +--- Orange/tests/test_table.py | 18 +++++++----------- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py index b6c42239015..415a6a8854f 100644 --- a/Orange/tests/test_normalize.py +++ b/Orange/tests/test_normalize.py @@ -98,11 +98,11 @@ def test_normalize_transform_by_span_zero_class(self): def test_normalize_sparse(self): domain = 
Domain([ContinuousVariable(str(i)) for i in range(3)]) # pylint: disable=bad-whitespace - X = sp.csr_matrix(np.array([ + X = np.array([ [0, -1, -2], [0, 1, 2], - ])) - data = Table.from_numpy(domain, X) + ]) + data = Table.from_numpy(domain, X).to_sparse() # pylint: disable=bad-whitespace solution = sp.csr_matrix(np.array([ diff --git a/Orange/tests/test_remove.py b/Orange/tests/test_remove.py index 759f1bf3e92..29dab16932e 100644 --- a/Orange/tests/test_remove.py +++ b/Orange/tests/test_remove.py @@ -4,7 +4,6 @@ import unittest import numpy as np -import scipy.sparse as sp from Orange.data import Table from Orange.preprocess import Remove @@ -138,8 +137,7 @@ def test_remove_unused_values_metas(self): def test_remove_unused_values_attr_sparse(self): data = self.test8 - data = data[1:] - data.X = sp.csr_matrix(data.X) + data = data[1:].to_sparse() remover = Remove(Remove.RemoveUnusedValues) new_data = remover(data) attr_res = remover.attr_results diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index 75db709246e..93241588b44 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -640,8 +640,7 @@ def test_copy(self): self.assertFalse(np.all(t.metas == copy.metas)) def test_copy_sparse(self): - t = data.Table('iris') - t.X = sp.csr_matrix(t.X) + t = data.Table('iris').to_sparse() copy = t.copy() self.assertEqual((t.X != copy.X).nnz, 0) # sparse matrices match by content @@ -1845,8 +1844,7 @@ def test_creates_table_with_given_domain_and_row_filter(self): new_table, self.table[:0], xcols=order, ycols=order, mcols=order) def test_from_table_sparse_move_some_to_empty_metas(self): - iris = data.Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = data.Table("iris").to_sparse() new_domain = data.domain.Domain( iris.domain.attributes[:2], iris.domain.class_vars, iris.domain.attributes[2:], source=iris.domain) @@ -1861,19 +1859,18 @@ def test_from_table_sparse_move_some_to_empty_metas(self): back_iris = 
data.Table.from_table(iris.domain, new_iris) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) - self.assertTrue(sp.issparse(back_iris.metas)) + self.assertFalse(sp.issparse(back_iris.metas)) self.assertEqual(back_iris.X.shape, iris.X.shape) self.assertEqual(back_iris.metas.shape, iris.metas.shape) def test_from_table_sparse_move_all_to_empty_metas(self): - iris = data.Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = data.Table("iris").to_sparse() new_domain = data.domain.Domain( [], iris.domain.class_vars, iris.domain.attributes, source=iris.domain) new_iris = data.Table.from_table(new_domain, iris) - self.assertTrue(sp.issparse(new_iris.X)) + self.assertFalse(sp.issparse(new_iris.X)) self.assertTrue(sp.issparse(new_iris.metas)) self.assertEqual(new_iris.X.shape, (len(iris), 0)) self.assertEqual(new_iris.metas.shape, (len(iris), 4)) @@ -1882,13 +1879,12 @@ def test_from_table_sparse_move_all_to_empty_metas(self): back_iris = data.Table.from_table(iris.domain, new_iris) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) - self.assertTrue(sp.issparse(back_iris.metas)) + self.assertFalse(sp.issparse(back_iris.metas)) self.assertEqual(back_iris.X.shape, iris.X.shape) self.assertEqual(back_iris.metas.shape, iris.metas.shape) def test_from_table_sparse_move_to_nonempty_metas(self): - brown = data.Table("brown-selected") - brown.X = sp.csr_matrix(brown.X) + brown = data.Table("brown-selected").to_sparse() n_attr = len(brown.domain.attributes) n_metas = len(brown.domain.metas) new_domain = data.domain.Domain( From a9bd8cacf34c5f86312db03d39bc89beecfc39ae Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 3 Nov 2017 13:46:09 +0100 Subject: [PATCH 10/14] TestTable: Use transform instead of from_table --- Orange/tests/test_table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index 93241588b44..19f7c0687d7 
100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -1848,7 +1848,7 @@ def test_from_table_sparse_move_some_to_empty_metas(self): new_domain = data.domain.Domain( iris.domain.attributes[:2], iris.domain.class_vars, iris.domain.attributes[2:], source=iris.domain) - new_iris = data.Table.from_table(new_domain, iris) + new_iris = iris.transform(new_domain) self.assertTrue(sp.issparse(new_iris.X)) self.assertTrue(sp.issparse(new_iris.metas)) @@ -1856,7 +1856,7 @@ def test_from_table_sparse_move_some_to_empty_metas(self): self.assertEqual(new_iris.metas.shape, (len(iris), 2)) # move back - back_iris = data.Table.from_table(iris.domain, new_iris) + back_iris = new_iris.transform(iris.domain) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) self.assertFalse(sp.issparse(back_iris.metas)) @@ -1868,7 +1868,7 @@ def test_from_table_sparse_move_all_to_empty_metas(self): new_domain = data.domain.Domain( [], iris.domain.class_vars, iris.domain.attributes, source=iris.domain) - new_iris = data.Table.from_table(new_domain, iris) + new_iris = iris.transform(new_domain) self.assertFalse(sp.issparse(new_iris.X)) self.assertTrue(sp.issparse(new_iris.metas)) @@ -1876,7 +1876,7 @@ def test_from_table_sparse_move_all_to_empty_metas(self): self.assertEqual(new_iris.metas.shape, (len(iris), 4)) # move back - back_iris = data.Table.from_table(iris.domain, new_iris) + back_iris = new_iris.transform(iris.domain) self.assertEqual(back_iris.domain, iris.domain) self.assertTrue(sp.issparse(back_iris.X)) self.assertFalse(sp.issparse(back_iris.metas)) From a0935fa84ae722451579636b0377f7606068e158 Mon Sep 17 00:00:00 2001 From: nikicc Date: Fri, 14 Jul 2017 16:46:14 +0200 Subject: [PATCH 11/14] WidgetTest, WidgetOutputsTestMixin: Reset variables' cache ... to alleviate interdependencies between tests in different files. 
Before that, one test could mark some attributes of iris as sparse which could cause tests in different files to crash due to variable "reusing". --- Orange/widgets/tests/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Orange/widgets/tests/base.py b/Orange/widgets/tests/base.py index 5db342a42e5..42b4d940aa1 100644 --- a/Orange/widgets/tests/base.py +++ b/Orange/widgets/tests/base.py @@ -24,7 +24,8 @@ from Orange.classification.base_classification import ( LearnerClassification, ModelClassification ) -from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable +from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable,\ + Variable from Orange.modelling import Fitter from Orange.preprocess import RemoveNaNColumns, Randomize from Orange.preprocess.preprocess import PreprocessorList @@ -100,6 +101,7 @@ def setUpClass(cls): report = OWReport() cls.widgets.append(report) OWReport.get_instance = lambda: report + Variable._clear_all_caches() def tearDown(self): """Process any pending events before the next test is executed.""" @@ -687,6 +689,7 @@ class WidgetOutputsTestMixin: """ def init(self): + Variable._clear_all_caches() self.data = Table("iris") self.same_input_output_domain = True From 8ed204d1cee73c9c0f01f3306b3df055ae670dce Mon Sep 17 00:00:00 2001 From: nikicc Date: Tue, 18 Jul 2017 17:20:23 +0200 Subject: [PATCH 12/14] Gui: Fixups according to improved sparsity --- Orange/widgets/data/tests/test_owfile.py | 3 +-- Orange/widgets/data/tests/test_owmergedata.py | 3 +-- Orange/widgets/model/tests/test_tree.py | 7 ++----- .../widgets/unsupervised/tests/test_owmanifoldlearning.py | 3 +-- Orange/widgets/visualize/owheatmap.py | 3 +-- Orange/widgets/visualize/owscatterplotgraph.py | 8 +------- Orange/widgets/visualize/owsieve.py | 3 +-- Orange/widgets/visualize/tests/test_owscatterplot.py | 7 ++----- Orange/widgets/visualize/tests/test_owsieve.py | 3 +-- 9 files changed, 11 insertions(+), 29 deletions(-) 
diff --git a/Orange/widgets/data/tests/test_owfile.py b/Orange/widgets/data/tests/test_owfile.py index fce9f8299dc..bf3e7a5fa67 100644 --- a/Orange/widgets/data/tests/test_owfile.py +++ b/Orange/widgets/data/tests/test_owfile.py @@ -308,8 +308,7 @@ def test_no_specified_reader(self): self.assertTrue(self.widget.Error.missing_reader.is_shown()) def test_domain_edit_on_sparse_data(self): - iris = Table("iris") - iris.X = sp.csr_matrix(iris.X) + iris = Table("iris").to_sparse() f = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False) pickle.dump(iris, f) diff --git a/Orange/widgets/data/tests/test_owmergedata.py b/Orange/widgets/data/tests/test_owmergedata.py index f125c1fc38a..9348e0c5c1d 100644 --- a/Orange/widgets/data/tests/test_owmergedata.py +++ b/Orange/widgets/data/tests/test_owmergedata.py @@ -435,8 +435,7 @@ def test_sparse(self): """ data = Table("iris")[::25] data_ed_dense = Table("titanic")[::300] - data_ed_sparse = Table("titanic")[::300] - data_ed_sparse.X = sp.csr_matrix(data_ed_sparse.X) + data_ed_sparse = Table("titanic")[::300].to_sparse() self.send_signal("Data", data) self.send_signal("Extra Data", data_ed_dense) diff --git a/Orange/widgets/model/tests/test_tree.py b/Orange/widgets/model/tests/test_tree.py index 18f48061892..2f3d919d39b 100644 --- a/Orange/widgets/model/tests/test_tree.py +++ b/Orange/widgets/model/tests/test_tree.py @@ -1,6 +1,5 @@ # pylint: disable=protected-access import numpy as np -import scipy.sparse as sp from Orange.base import Model from Orange.data import Table @@ -49,8 +48,7 @@ def test_sparse_data_classification(self): table1 = Table("iris") self.send_signal("Data", table1) model_dense = self.get_output("Model") - table2 = Table("iris") - table2.X = sp.csr_matrix(table2.X) + table2 = Table("iris").to_sparse() self.send_signal("Data", table2) model_sparse = self.get_output("Model") self.assertTrue(np.array_equal(model_dense._code, model_sparse._code)) @@ -64,8 +62,7 @@ def test_sparse_data_regression(self): 
table1 = Table("housing") self.send_signal("Data", table1) model_dense = self.get_output("Model") - table2 = Table("housing") - table2.X = sp.csr_matrix(table2.X) + table2 = Table("housing").to_sparse() self.send_signal("Data", table2) model_sparse = self.get_output("Model") self.assertTrue(np.array_equal(model_dense._code, model_sparse._code)) diff --git a/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py b/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py index 7578637a357..bdca5b1ac48 100644 --- a/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py +++ b/Orange/widgets/unsupervised/tests/test_owmanifoldlearning.py @@ -66,8 +66,7 @@ def _compare_tables(self, _output, n_components): np.testing.assert_array_equal(self.iris.metas, _output.metas) def test_sparse_data(self): - data = Table("iris") - data.X = sparse.csr_matrix(data.X) + data = Table("iris").to_sparse() self.assertTrue(sparse.issparse(data.X)) self.widget.manifold_method_index = 2 self.send_signal(self.widget.Inputs.data, data) diff --git a/Orange/widgets/visualize/owheatmap.py b/Orange/widgets/visualize/owheatmap.py index 982ed57308f..0c515c4bb97 100644 --- a/Orange/widgets/visualize/owheatmap.py +++ b/Orange/widgets/visualize/owheatmap.py @@ -663,8 +663,7 @@ def set_dataset(self, data=None): if data is not None and sp.issparse(data.X): try: - data = data.copy() - data.X = data.X.toarray() + data = data.to_dense() except MemoryError: data = None self.Error.not_enough_memory() diff --git a/Orange/widgets/visualize/owscatterplotgraph.py b/Orange/widgets/visualize/owscatterplotgraph.py index e60265a0b2a..38f95e718e0 100644 --- a/Orange/widgets/visualize/owscatterplotgraph.py +++ b/Orange/widgets/visualize/owscatterplotgraph.py @@ -623,13 +623,7 @@ def sparse_to_dense(self): domain = data.domain all_attrs = domain.variables + domain.metas attrs = list(set(all_attrs) & attrs) - selected_data = data[:, attrs] - if sp.issparse(selected_data.X): - selected_data.X = 
selected_data.X.toarray() - if sp.issparse(selected_data.Y): - selected_data.Y = selected_data.Y.toarray() - if sp.issparse(selected_data.metas): - selected_data.metas = selected_data.metas.toarray() + selected_data = data[:, attrs].to_dense() return selected_data def _clear_plot_widget(self): diff --git a/Orange/widgets/visualize/owsieve.py b/Orange/widgets/visualize/owsieve.py index 6afebdb7386..1137876eded 100644 --- a/Orange/widgets/visualize/owsieve.py +++ b/Orange/widgets/visualize/owsieve.py @@ -209,7 +209,7 @@ def discretizer(data): discretize = Discretize( method=EqualFreq(n=4), remove_const=False, discretize_classes=True, discretize_metas=True) - return discretize(data) + return discretize(data).to_dense() return data if not data.is_sparse() and not init: @@ -219,7 +219,6 @@ def discretizer(data): self.attr_y} new_domain = data.domain.select_columns(attrs) data = Table.from_table(new_domain, data) - data.X = data.X.toarray() return discretizer(data) @Inputs.features diff --git a/Orange/widgets/visualize/tests/test_owscatterplot.py b/Orange/widgets/visualize/tests/test_owscatterplot.py index 4f455920a7f..baec6ef9d0f 100644 --- a/Orange/widgets/visualize/tests/test_owscatterplot.py +++ b/Orange/widgets/visualize/tests/test_owscatterplot.py @@ -317,11 +317,8 @@ def test_sparse(self): GH-2152 GH-2157 """ - table = Table("iris") - table.X = sp.csr_matrix(table.X) - self.assertTrue(sp.issparse(table.X)) - table.Y = sp.csr_matrix(table._Y) # pylint: disable=protected-access - self.assertTrue(sp.issparse(table.Y)) + table = Table("iris").to_sparse(sparse_attributes=True, + sparse_class=True) self.send_signal(self.widget.Inputs.data, table) self.widget.set_subset_data(table[:30]) data = self.get_output("Data") diff --git a/Orange/widgets/visualize/tests/test_owsieve.py b/Orange/widgets/visualize/tests/test_owsieve.py index 2280ba1ef0d..a8e72b44c5f 100644 --- a/Orange/widgets/visualize/tests/test_owsieve.py +++ b/Orange/widgets/visualize/tests/test_owsieve.py @@ 
-3,7 +3,6 @@ from math import isnan from unittest.mock import patch import numpy as np -import scipy.sparse as sp from AnyQt.QtCore import QEvent, QPoint, Qt from AnyQt.QtGui import QMouseEvent @@ -100,7 +99,7 @@ def test_sparse_data(self): output = self.get_output("Data") self.assertFalse(output.is_sparse()) - table.X = sp.csr_matrix(table.X) + table = table.to_sparse() self.send_signal(self.widget.Inputs.data, table) self.assertEqual(len(self.widget.discrete_data.domain), 2) output = self.get_output("Data") From 7af1181ea3cea013f807370721f1dab51c4c1810 Mon Sep 17 00:00:00 2001 From: nikicc Date: Thu, 9 Nov 2017 14:50:35 +0100 Subject: [PATCH 13/14] Add tests for Table.from_table sparsity handling --- Orange/tests/test_table.py | 57 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index 19f7c0687d7..6b33ca89822 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -2701,11 +2701,16 @@ def _compare_tables(self, table1, table2): for x in table2.domain.metas]) -class TestTableSparseDenseTransformations(unittest.TestCase): +class SparseCV: + def __call__(self, data): + return sp.csr_matrix((len(data), 1)) + + +class TestTableSparseDense(unittest.TestCase): def setUp(self): self.iris = Table('iris') - def test_conversion(self): + def test_sparse_dense_transformation(self): iris = Table('iris') iris_sparse = iris.to_sparse(sparse_attributes=True) self.assertTrue(sp.issparse(iris_sparse.X)) @@ -2722,6 +2727,54 @@ def test_conversion(self): self.assertFalse(sp.issparse(dense_iris.Y)) self.assertFalse(sp.issparse(dense_iris.metas)) + def test_from_table_add_one_sparse_column(self): + # add one sparse feature, should remain dense + domain = self.iris.domain.copy() + domain.attributes += ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertFalse(sp.issparse(d.X)) + + def 
test_from_table_add_lots_of_sparse_columns(self): + n_attrs = len(self.iris.domain.attributes) + + # add 2*n_attrs+1 sparse features, should become sparse + domain = self.iris.domain.copy() + domain.attributes += tuple( + ContinuousVariable('S' + str(i), compute_value=SparseCV(), sparse=True) + for i in range(2*n_attrs + 1) + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.X)) + + def test_from_table_replace_attrs_with_sparse(self): + # replace attrs with a sparse feature, should become sparse + domain = self.iris.domain.copy() + domain.attributes = ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.X)) + + def test_from_table_sparse_metas(self): + # replace metas with a sparse feature, should become sparse + domain = self.iris.domain.copy() + domain._metas = ( + ContinuousVariable('S1', compute_value=SparseCV(), sparse=True), + ) + d = self.iris.transform(domain) + self.assertTrue(sp.issparse(d.metas)) + + def test_from_table_sparse_metas_with_strings(self): + # replace metas with text and 100 sparse features, should be dense + domain = self.iris.domain.copy() + domain._metas = (StringVariable('text'),) + tuple( + ContinuousVariable('S' + str(i), compute_value=SparseCV(), sparse=True) + for i in range(100) + ) + d = self.iris.transform(domain) + self.assertFalse(sp.issparse(d.metas)) if __name__ == "__main__": unittest.main() From 4bf6a206846bb8101579faa23204d39dd13ca05f Mon Sep 17 00:00:00 2001 From: nikicc Date: Tue, 18 Jul 2017 23:47:47 +0200 Subject: [PATCH 14/14] PyLint --- Orange/preprocess/discretize.py | 2 +- Orange/preprocess/impute.py | 2 +- Orange/tests/test_domain.py | 9 +++++---- Orange/tests/test_remove.py | 1 + Orange/tests/test_table.py | 14 +++++++------- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Orange/preprocess/discretize.py b/Orange/preprocess/discretize.py index 0878ba11f7e..30e827c3e73 100644 ---
a/Orange/preprocess/discretize.py +++ b/Orange/preprocess/discretize.py @@ -1,7 +1,7 @@ import numpy as np import scipy.sparse as sp -from Orange.data import DiscreteVariable, Domain, Table +from Orange.data import DiscreteVariable, Domain from Orange.data.sql.table import SqlTable from Orange.preprocess.util import _RefuseDataInConstructor from Orange.statistics import distribution, contingency diff --git a/Orange/preprocess/impute.py b/Orange/preprocess/impute.py index a37568e02ea..68f968891a2 100644 --- a/Orange/preprocess/impute.py +++ b/Orange/preprocess/impute.py @@ -157,7 +157,7 @@ def __call__(self, data): column = np.array([float(data[self.variable])]) else: column = np.array(data.get_column_view(self.variable)[0], - copy=True) + copy=True) mask = np.isnan(column) if not np.any(mask): diff --git a/Orange/tests/test_domain.py b/Orange/tests/test_domain.py index 1207b54a894..9deef63176c 100644 --- a/Orange/tests/test_domain.py +++ b/Orange/tests/test_domain.py @@ -11,7 +11,7 @@ from numpy.testing import assert_array_equal from Orange.data import ( - ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable, + ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable, Variable, Domain, Table, DomainConversion) from Orange.data.domain import filter_visible from Orange.preprocess import Continuize, Impute @@ -165,7 +165,8 @@ def test_from_numpy_values(self): (0, 2, DiscreteVariable), (18, 23, ContinuousVariable)]: n_rows, n_cols, = aran_max - aran_min, 1 - d = Domain.from_numpy(np.zeros((1, 1)), np.arange(aran_min, aran_max).reshape(n_rows, n_cols)) + d = Domain.from_numpy(np.zeros((1, 1)), + np.arange(aran_min, aran_max).reshape(n_rows, n_cols)) self.assertTrue(d.anonymous) self.assertIsInstance(d.class_var, vartype) if isinstance(vartype, DiscreteVariable): @@ -402,14 +403,14 @@ def test_conversion(self): assert_array_equal(y, np.array([0])) metas_exp = [gender.Unknown, education.Unknown, ssn.Unknown] - def eq(a, b): + def equal(a, b): if 
isinstance(a, Real) and isinstance(b, Real) and \ np.isnan(a) and np.isnan(b): return True else: return a == b - self.assertTrue(all(starmap(eq, zip(metas, metas_exp)))) + self.assertTrue(all(starmap(equal, zip(metas, metas_exp)))) x, y, metas = domain.convert([42, 13, "White", "M", "HS", "1234567"]) assert_array_equal(x, np.array([42, 13])) diff --git a/Orange/tests/test_remove.py b/Orange/tests/test_remove.py index 29dab16932e..3b70931062a 100644 --- a/Orange/tests/test_remove.py +++ b/Orange/tests/test_remove.py @@ -9,6 +9,7 @@ from Orange.preprocess import Remove from Orange.tests import test_filename + class TestRemover(unittest.TestCase): @classmethod diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index 6b33ca89822..aca641b1d67 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -1777,7 +1777,7 @@ def test_can_filter_row_with_slice(self): new_table, self.table, rows=slice_) def test_can_use_attributes_as_new_columns(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) order = [random.randrange(a) for _ in self.domain.attributes] new_attributes = [self.domain.attributes[i] for i in order] new_domain = self.create_domain( @@ -1788,7 +1788,7 @@ def test_can_use_attributes_as_new_columns(self): new_table, self.table, xcols=order, ycols=order, mcols=order) def test_can_use_class_vars_as_new_columns(self): - a, c, m = column_sizes(self.table) + a, c, _ = column_sizes(self.table) order = [random.randrange(a, a + c) for _ in self.domain.class_vars] new_classes = [self.domain.class_vars[i - a] for i in order] new_domain = self.create_domain(new_classes, new_classes, new_classes) @@ -1798,7 +1798,7 @@ def test_can_use_class_vars_as_new_columns(self): new_table, self.table, xcols=order, ycols=order, mcols=order) def test_can_use_metas_as_new_columns(self): - a, c, m = column_sizes(self.table) + _, _, m = column_sizes(self.table) order = [random.randrange(-m + 1, 0) for _ in self.domain.metas] 
new_metas = [self.domain.metas[::-1][i] for i in order] new_domain = self.create_domain(new_metas, new_metas, new_metas) @@ -2049,7 +2049,7 @@ def test_can_assign_values(self): self.assertAlmostEqual(self.table.X[0, 0], 42.) def test_can_assign_values_to_classes(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) self.table[0, a] = 42. self.assertAlmostEqual(self.table.Y[0], 42.) @@ -2067,7 +2067,7 @@ def test_can_assign_rows_to_rows(self): self.table.metas[0], self.table.metas[1]) def test_can_assign_lists(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) new_example = [float(i) for i in range(len(self.attributes + self.class_vars))] self.table[0] = new_example @@ -2077,7 +2077,7 @@ def test_can_assign_lists(self): self.table.Y[0], np.array(new_example[a:])) def test_can_assign_np_array(self): - a, c, m = column_sizes(self.table) + a, _, _ = column_sizes(self.table) new_example = \ np.array([float(i) for i in range(len(self.attributes + self.class_vars))]) @@ -2199,7 +2199,7 @@ def test_delete_rows(self): def test_clear(self): self.table.clear() self.assertEqual(len(self.table), 0) - for i in self.table: + for _ in self.table: self.fail("Table should not contain any rows.") def test_subclasses(self):