From 6ae06fcc73392bab1dd83ef458a1f9c0868db3e6 Mon Sep 17 00:00:00 2001 From: janezd Date: Tue, 9 Feb 2021 20:19:32 +0100 Subject: [PATCH 1/2] Table: Add methods 'join' and 'with_column' --- Orange/data/table.py | 89 +++++++++++++++++++++- Orange/data/tests/test_table.py | 128 ++++++++++++++++++++++++++++++++ Orange/tests/test_table.py | 1 - 3 files changed, 214 insertions(+), 4 deletions(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index 6e057d288b7..059148a5c96 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -897,7 +897,33 @@ def __repr__(self): @classmethod def concatenate(cls, tables, axis=0): - """Concatenate tables into a new table""" + """ + Concatenate tables into a new table, either horizontally or vertically. + + If axis=0 (horizontal concatenate), all tables must have the same domain. + + If axis=1 (vertical), + - all variable names must be unique. + - ids are copied from the first table. + - weights are copied from the first table in which they are defined. + - the dictionary of table's attributes are merged. If the same attribute + appears in multiple dictionaries, the earlier are used. + + Args: + tables (Table): tables to be joined + + Returns: + table (Table) + """ + if axis == 0: + return cls._concatenate_vertical(tables) + elif axis == 1: + return cls._concatenate_horizontal(tables) + else: + raise ValueError("invalid axis") + + @classmethod + def _concatenate_vertical(cls, tables): def vstack(arrs): return [np, sp][any(sp.issparse(arr) for arr in arrs)].vstack(arrs) @@ -915,8 +941,6 @@ def merge1d(arrs): def collect(attr): return [getattr(arr, attr) for arr in tables] - if axis == 1: - raise ValueError("concatenate no longer supports axis 1") if not tables: raise ValueError('need at least one table to concatenate') if len(tables) == 1: @@ -942,6 +966,65 @@ def collect(attr): conc.attributes.update(table.attributes) return conc + @classmethod + def _concatenate_horizontal(cls, tables): + """ + """ + if not tables: + raise ValueError('need at least one table to join') + + def all_of(objs, names): + return (tuple(getattr(obj, name) for obj in objs) + for name in names) + + def stack(arrs): + non_empty = tuple(arr if arr.ndim == 2 else arr[:, np.newaxis] + for arr in arrs + if arr is not None and arr.size > 0) + return np.hstack(non_empty) if non_empty else None + + doms, Ws, table_attrss = all_of(tables, ("domain", "W", "attributes")) + Xs, Ys, Ms = map(stack, all_of(tables, ("X", "Y", "metas"))) + # pylint: disable=undefined-loop-variable + for W in Ws: + if W.size: + break + + parts = all_of(doms, ("attributes", "class_vars", "metas")) + domain = Domain(*(tuple(chain(*lst)) for lst in parts)) + table = cls.from_numpy(domain, Xs, Ys, Ms, W, ids=tables[0].ids) + for ta in reversed(table_attrss): + table.attributes.update(ta) + + return table + + def add_column(self, variable, data, to_metas=None): + """ + Create a new table with an additional column + + Column's name must be unique + + Args: + variable (Variable): variable for the new column + data (np.ndarray): data for the new column + to_metas (bool, optional): if `True` the column is added as meta + column. Otherwise, primitive variables are added to attributes + and non-primitive to metas. + + Returns: + table (Table): a new table with the additional column + """ + dom = self.domain + attrs, classes, metas = dom.attributes, dom.class_vars, dom.metas + if to_metas or not variable.is_primitive(): + metas += (variable, ) + else: + attrs += (variable, ) + domain = Domain(attrs, classes, metas) + new_table = self.transform(domain) + new_table.get_column_view(variable)[0][:] = data + return new_table + def is_view(self): """ Return `True` if all arrays represent a view referring to another table diff --git a/Orange/data/tests/test_table.py b/Orange/data/tests/test_table.py index 4ed53145389..0c4bfbb0ac1 100644 --- a/Orange/data/tests/test_table.py +++ b/Orange/data/tests/test_table.py @@ -108,6 +108,134 @@ def test_from_numpy_sparse(self): t = Table.from_numpy(domain, sp.bsr_matrix(x)) self.assertTrue(sp.isspmatrix_csr(t.X)) + @staticmethod + def _new_table(attrs, classes, metas, s): + def nz(x): # pylint: disable=invalid-name + return x if x.size else np.empty((5, 0)) + + domain = Domain(attrs, classes, metas) + X = np.arange(s, s + len(attrs) * 5).reshape(5, -1) + Y = np.arange(100 + s, 100 + s + len(classes) * 5) + if len(classes) > 1: + Y = Y.reshape(5, -1) + M = np.arange(200 + s, 200 + s + len(metas) * 5).reshape(5, -1) + return Table.from_numpy(domain, nz(X), nz(Y), nz(M)) + + def test_concatenate_horizontal(self): + a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg") + + # Common case; one class, no empty's + tab1 = self._new_table((a, b), (c, ), (d, ), 0) + tab2 = self._new_table((e, ), (), (f, g), 1000) + joined = Table.concatenate((tab1, tab2), axis=1) + domain = joined.domain + self.assertEqual(domain.attributes, (a, b, e)) + self.assertEqual(domain.class_vars, (c, )) + self.assertEqual(domain.metas, (d, f, g)) + np.testing.assert_equal(joined.X, np.hstack((tab1.X, tab2.X))) + np.testing.assert_equal(joined.Y, tab1.Y) + np.testing.assert_equal(joined.metas, np.hstack((tab1.metas, tab2.metas))) + + # One part of one table is empty + tab1 = self._new_table((a, b), (), (), 0) + tab2 = self._new_table((), (), (c, ), 1000) + joined = Table.concatenate((tab1, tab2), axis=1) + domain = joined.domain + self.assertEqual(domain.attributes, (a, b)) + self.assertEqual(domain.class_vars, ()) + self.assertEqual(domain.metas, (c, )) + np.testing.assert_equal(joined.X, np.hstack((tab1.X, tab2.X))) + np.testing.assert_equal(joined.metas, np.hstack((tab1.metas, tab2.metas))) + + # Multiple classes, two empty parts are merged + tab1 = self._new_table((a, b), (c, ), (), 0) + tab2 = self._new_table((), (d, ), (), 1000) + joined = Table.concatenate((tab1, tab2), axis=1) + domain = joined.domain + self.assertEqual(domain.attributes, (a, b)) + self.assertEqual(domain.class_vars, (c, d)) + self.assertEqual(domain.metas, ()) + np.testing.assert_equal(joined.X, np.hstack((tab1.X, tab2.X))) + np.testing.assert_equal(joined.Y, np.vstack((tab1.Y, tab2.Y)).T) + + # Merging of attributes and selection of weights + tab1 = self._new_table((a, b), (c, ), (), 0) + tab1.attributes = dict(a=5, b=7) + tab2 = self._new_table((d, ), (e, ), (), 1000) + tab2.W = np.arange(5) + tab3 = self._new_table((f, g), (), (), 2000) + tab3.attributes = dict(a=1, c=4) + tab3.W = np.arange(5, 10) + joined = Table.concatenate((tab1, tab2, tab3), axis=1) + domain = joined.domain + self.assertEqual(domain.attributes, (a, b, d, f, g)) + self.assertEqual(domain.class_vars, (c, e)) + self.assertEqual(domain.metas, ()) + np.testing.assert_equal(joined.X, np.hstack((tab1.X, tab2.X, tab3.X))) + np.testing.assert_equal(joined.Y, np.vstack((tab1.Y, tab2.Y)).T) + self.assertEqual(joined.attributes, dict(a=5, b=7, c=4)) + np.testing.assert_equal(joined.ids, tab1.ids) + np.testing.assert_equal(joined.W, tab2.W) + + # Raise an exception when no tables are given + self.assertRaises(ValueError, Table.concatenate, (), axis=1) + + def test_concatenate_invalid_axis(self): + self.assertRaises(ValueError, Table.concatenate, (), axis=2) + + def test_with_column(self): + a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg") + col = np.arange(9, 14) + colr = col.reshape(5, -1) + tab = self._new_table((a, b, c), (d, ), (e, f), 0) + + # Add to attributes + tabw = tab.add_column(g, np.arange(9, 14)) + self.assertEqual(tabw.domain.attributes, (a, b, c, g)) + np.testing.assert_equal(tabw.X, np.hstack((tab.X, colr))) + np.testing.assert_equal(tabw.Y, tab.Y) + np.testing.assert_equal(tabw.metas, tab.metas) + + # Add to metas + tabw = tab.add_column(g, np.arange(9, 14), to_metas=True) + self.assertEqual(tabw.domain.metas, (e, f, g)) + np.testing.assert_equal(tabw.X, tab.X) + np.testing.assert_equal(tabw.Y, tab.Y) + np.testing.assert_equal(tabw.metas, np.hstack((tab.metas, colr))) + + # Add to empty attributes + tab = self._new_table((), (d, ), (e, f), 0) + tabw = tab.add_column(g, np.arange(9, 14)) + self.assertEqual(tabw.domain.attributes, (g, )) + np.testing.assert_equal(tabw.X, colr) + np.testing.assert_equal(tabw.Y, tab.Y) + np.testing.assert_equal(tabw.metas, tab.metas) + + # Add to empty metas + tab = self._new_table((a, b, c), (d, ), (), 0) + tabw = tab.add_column(g, np.arange(9, 14), to_metas=True) + self.assertEqual(tabw.domain.metas, (g, )) + np.testing.assert_equal(tabw.X, tab.X) + np.testing.assert_equal(tabw.Y, tab.Y) + np.testing.assert_equal(tabw.metas, colr) + + # Pass values as a list + tab = self._new_table((a, ), (d, ), (e, f), 0) + tabw = tab.add_column(g, [4, 2, -1, 2, 5]) + self.assertEqual(tabw.domain.attributes, (a, g)) + np.testing.assert_equal( + tabw.X, np.array([[0, 1, 2, 3, 4], [4, 2, -1, 2, 5]]).T) + + # Add non-primitives as metas; join `float` and `object` to `object` + tab = self._new_table((a, ), (d, ), (e, f), 0) + t = StringVariable("t") + tabw = tab.add_column(t, list("abcde")) + self.assertEqual(tabw.domain.attributes, (a, )) + self.assertEqual(tabw.domain.metas, (e, f, t)) + np.testing.assert_equal( + tabw.metas, + np.hstack((tab.metas, np.array(list("abcde")).reshape(5, -1)))) + class TestTableFilters(unittest.TestCase): def setUp(self): diff --git a/Orange/tests/test_table.py b/Orange/tests/test_table.py index e05606bd3e8..7714479f77f 100644 --- a/Orange/tests/test_table.py +++ b/Orange/tests/test_table.py @@ -541,7 +541,6 @@ def test_concatenate_exceptions(self): iris = data.Table("iris") self.assertRaises(ValueError, data.Table.concatenate, []) - self.assertRaises(ValueError, data.Table.concatenate, [zoo], axis=1) self.assertRaises(ValueError, data.Table.concatenate, [zoo, iris]) def test_concatenate_sparse(self): From 5429e1c18b0c05147bc2781fb28ec1b984d07817 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 19 Feb 2021 11:54:27 +0100 Subject: [PATCH 2/2] Table.concatenate: Refactor --- Orange/data/table.py | 52 ++++++++++++++++----------------- Orange/data/tests/test_table.py | 12 ++++++++ 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/Orange/data/table.py b/Orange/data/table.py index 059148a5c96..4ea833e9744 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -898,11 +898,11 @@ def __repr__(self): @classmethod def concatenate(cls, tables, axis=0): """ - Concatenate tables into a new table, either horizontally or vertically. + Concatenate tables into a new table, either vertically or horizontally. - If axis=0 (horizontal concatenate), all tables must have the same domain. + If axis=0 (vertical concatenate), all tables must have the same domain. - If axis=1 (vertical), + If axis=1 (horizontal), - all variable names must be unique. - ids are copied from the first table. - weights are copied from the first table in which they are defined. @@ -915,12 +915,28 @@ def concatenate(cls, tables, axis=0): Returns: table (Table) """ + if axis not in (0, 1): + raise ValueError("invalid axis") + if not tables: + raise ValueError('need at least one table to concatenate') + + if len(tables) == 1: + return tables[0].copy() + if axis == 0: - return cls._concatenate_vertical(tables) - elif axis == 1: - return cls._concatenate_horizontal(tables) + conc = cls._concatenate_vertical(tables) else: - raise ValueError("invalid axis") + conc = cls._concatenate_horizontal(tables) + + # TODO: Add attributes = {} to __init__ + conc.attributes = getattr(conc, "attributes", {}) + for table in reversed(tables): + conc.attributes.update(table.attributes) + + names = [table.name for table in tables if table.name != "untitled"] + if names: + conc.name = names[0] + return conc @classmethod def _concatenate_vertical(cls, tables): @@ -941,10 +957,6 @@ def merge1d(arrs): def collect(attr): return [getattr(arr, attr) for arr in tables] - if not tables: - raise ValueError('need at least one table to concatenate') - if len(tables) == 1: - return tables[0].copy() domain = tables[0].domain if any(table.domain != domain for table in tables): raise ValueError('concatenated tables must have the same domain') @@ -957,22 +969,12 @@ def collect(attr): merge1d(collect("W")) ) conc.ids = np.hstack([t.ids for t in tables]) - names = [table.name for table in tables if table.name != "untitled"] - if names: - conc.name = names[0] - # TODO: Add attributes = {} to __init__ - conc.attributes = getattr(conc, "attributes", {}) - for table in reversed(tables): - conc.attributes.update(table.attributes) return conc @classmethod def _concatenate_horizontal(cls, tables): """ """ - if not tables: - raise ValueError('need at least one table to join') - def all_of(objs, names): return (tuple(getattr(obj, name) for obj in objs) for name in names) @@ -983,7 +985,7 @@ def stack(arrs): if arr is not None and arr.size > 0) return np.hstack(non_empty) if non_empty else None - doms, Ws, table_attrss = all_of(tables, ("domain", "W", "attributes")) + doms, Ws = all_of(tables, ("domain", "W")) Xs, Ys, Ms = map(stack, all_of(tables, ("X", "Y", "metas"))) # pylint: disable=undefined-loop-variable for W in Ws: @@ -992,11 +994,7 @@ def stack(arrs): parts = all_of(doms, ("attributes", "class_vars", "metas")) domain = Domain(*(tuple(chain(*lst)) for lst in parts)) - table = cls.from_numpy(domain, Xs, Ys, Ms, W, ids=tables[0].ids) - for ta in reversed(table_attrss): - table.attributes.update(ta) - - return table + return cls.from_numpy(domain, Xs, Ys, Ms, W, ids=tables[0].ids) def add_column(self, variable, data, to_metas=None): """ diff --git a/Orange/data/tests/test_table.py b/Orange/data/tests/test_table.py index 0c4bfbb0ac1..836110276b3 100644 --- a/Orange/data/tests/test_table.py +++ b/Orange/data/tests/test_table.py @@ -183,6 +183,18 @@ def test_concatenate_horizontal(self): def test_concatenate_invalid_axis(self): self.assertRaises(ValueError, Table.concatenate, (), axis=2) + def test_concatenate_names(self): + a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg") + + tab1 = self._new_table((a, ), (c, ), (d, ), 0) + tab2 = self._new_table((e, ), (), (f, g), 1000) + tab3 = self._new_table((b, ), (), (), 1000) + tab2.name = "tab2" + tab3.name = "tab3" + + joined = Table.concatenate((tab1, tab2, tab3), axis=1) + self.assertEqual(joined.name, "tab2") + def test_with_column(self): a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg") col = np.arange(9, 14)