Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Table: Add methods 'join' and 'with_column' #5251

Merged
merged 2 commits into from
Feb 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 95 additions & 14 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,49 @@ def __repr__(self):

@classmethod
def concatenate(cls, tables, axis=0):
    """
    Concatenate tables into a new table, either vertically or horizontally.

    If axis=0 (vertical concatenate), all tables must have the same domain.

    If axis=1 (horizontal),
    - all variable names must be unique.
    - ids are copied from the first table.
    - weights are copied from the first table in which they are defined.

    For either axis, the tables' dictionaries of attributes are merged; if
    the same key appears in multiple dictionaries, the value from the
    earliest table is used. The new table is given the name of the first
    table whose name is not "untitled", if there is one.

    If only a single table is given, a copy of it is returned.

    Args:
        tables (iterable of Table): tables to be joined
        axis (int): 0 to stack rows (vertical), 1 to stack columns
            (horizontal)

    Returns:
        table (Table)

    Raises:
        ValueError: if `axis` is neither 0 nor 1, or if no tables are given
    """
    if axis not in (0, 1):
        raise ValueError("invalid axis")
    # A one-shot iterable (e.g. a generator) is always truthy, has no
    # `len` and can be traversed only once, while the code below needs all
    # three. Lists and tuples are passed through untouched.
    if not isinstance(tables, (list, tuple)):
        tables = tuple(tables)
    if not tables:
        raise ValueError('need at least one table to concatenate')

    if len(tables) == 1:
        return tables[0].copy()

    if axis == 0:
        conc = cls._concatenate_vertical(tables)
    else:
        conc = cls._concatenate_horizontal(tables)

    # TODO: Add attributes = {} to __init__
    conc.attributes = getattr(conc, "attributes", {})
    # Update in reverse order so that earlier tables overwrite later ones.
    for table in reversed(tables):
        conc.attributes.update(table.attributes)

    names = [table.name for table in tables if table.name != "untitled"]
    if names:
        conc.name = names[0]
    return conc

@classmethod
def _concatenate_vertical(cls, tables):
def vstack(arrs):
return [np, sp][any(sp.issparse(arr) for arr in arrs)].vstack(arrs)

Expand All @@ -915,12 +957,6 @@ def merge1d(arrs):
def collect(attr):
return [getattr(arr, attr) for arr in tables]

if axis == 1:
raise ValueError("concatenate no longer supports axis 1")
if not tables:
raise ValueError('need at least one table to concatenate')
if len(tables) == 1:
return tables[0].copy()
domain = tables[0].domain
if any(table.domain != domain for table in tables):
raise ValueError('concatenated tables must have the same domain')
Expand All @@ -933,15 +969,60 @@ def collect(attr):
merge1d(collect("W"))
)
conc.ids = np.hstack([t.ids for t in tables])
names = [table.name for table in tables if table.name != "untitled"]
if names:
conc.name = names[0]
# TODO: Add attributes = {} to __init__
conc.attributes = getattr(conc, "attributes", {})
for table in reversed(tables):
conc.attributes.update(table.attributes)
return conc

@classmethod
def _concatenate_horizontal(cls, tables):
    """
    Join the given tables side by side into a single new table.

    The new domain chains the attributes, class variables and metas of all
    tables, in the order in which the tables are given. Non-empty `X`, `Y`
    and `metas` arrays are stacked column-wise (1-d arrays are treated as
    single columns); a part that is empty in every table is passed on as
    `None`. Weights are taken from the first table with non-empty `W`
    (or from the last table if none has any), and ids from the first table.

    Args:
        tables (sequence of Table): tables to be joined

    Returns:
        table (Table)
    """
    def gather(objs, name):
        return tuple(getattr(obj, name) for obj in objs)

    def hstack_parts(arrs):
        columns = []
        for arr in arrs:
            if arr is None or not arr.size > 0:
                continue
            columns.append(arr if arr.ndim == 2 else arr[:, np.newaxis])
        if not columns:
            return None
        return np.hstack(tuple(columns))

    domains = gather(tables, "domain")
    all_weights = gather(tables, "W")
    joined_x = hstack_parts(gather(tables, "X"))
    joined_y = hstack_parts(gather(tables, "Y"))
    joined_metas = hstack_parts(gather(tables, "metas"))

    # The loop variable is used after the loop on purpose: it stops at the
    # first non-empty weights, or is left at the last (empty) ones.
    # pylint: disable=undefined-loop-variable
    for weights in all_weights:
        if weights.size:
            break

    merged_vars = [
        tuple(chain(*gather(domains, part)))
        for part in ("attributes", "class_vars", "metas")
    ]
    domain = Domain(*merged_vars)
    return cls.from_numpy(
        domain, joined_x, joined_y, joined_metas, weights,
        ids=tables[0].ids)

def add_column(self, variable, data, to_metas=None):
    """
    Create a new table that has one more column than this one.

    The name of the new column must be unique.

    Args:
        variable (Variable): variable describing the new column
        data (np.ndarray): values for the new column
        to_metas (bool, optional): if `True`, the column is appended to
            metas. Otherwise primitive variables are appended to
            attributes and non-primitive ones to metas.

    Returns:
        table (Table): a new table with the additional column
    """
    source_domain = self.domain
    attributes = source_domain.attributes
    class_vars = source_domain.class_vars
    metas = source_domain.metas

    goes_to_metas = to_metas or not variable.is_primitive()
    if goes_to_metas:
        metas += (variable, )
    else:
        attributes += (variable, )

    extended = self.transform(Domain(attributes, class_vars, metas))
    # Write the values into the new table's column in place.
    column = extended.get_column_view(variable)[0]
    column[:] = data
    return extended

def is_view(self):
"""
Return `True` if all arrays represent a view referring to another table
Expand Down
140 changes: 140 additions & 0 deletions Orange/data/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,146 @@ def test_from_numpy_sparse(self):
t = Table.from_numpy(domain, sp.bsr_matrix(x))
self.assertTrue(sp.isspmatrix_csr(t.X))

@staticmethod
def _new_table(attrs, classes, metas, s):
    """
    Build a five-row table filled with consecutive integers.

    `X` starts counting at `s`, `Y` at `100 + s` and metas at `200 + s`.
    Parts without any variables are given as empty arrays with five rows.
    """
    n_rows = 5

    def or_empty(arr):
        if arr.size:
            return arr
        return np.empty((n_rows, 0))

    def counting(start, n_columns):
        return np.arange(start, start + n_columns * n_rows)

    domain = Domain(attrs, classes, metas)
    X = counting(s, len(attrs)).reshape(n_rows, -1)
    Y = counting(100 + s, len(classes))
    # A single class is kept as a 1-d array.
    if len(classes) > 1:
        Y = Y.reshape(n_rows, -1)
    M = counting(200 + s, len(metas)).reshape(n_rows, -1)
    return Table.from_numpy(domain, or_empty(X), or_empty(Y), or_empty(M))

def test_concatenate_horizontal(self):
    """Check domains, data, attributes, ids and weights for axis=1."""
    a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg")
    new_table = self._new_table
    assert_equal = np.testing.assert_equal

    def check_domain(table, attributes, class_vars, metas):
        domain = table.domain
        self.assertEqual(domain.attributes, attributes)
        self.assertEqual(domain.class_vars, class_vars)
        self.assertEqual(domain.metas, metas)

    # Common case; one class, no empty's
    left = new_table((a, b), (c, ), (d, ), 0)
    right = new_table((e, ), (), (f, g), 1000)
    joined = Table.concatenate((left, right), axis=1)
    check_domain(joined, (a, b, e), (c, ), (d, f, g))
    assert_equal(joined.X, np.hstack((left.X, right.X)))
    assert_equal(joined.Y, left.Y)
    assert_equal(joined.metas, np.hstack((left.metas, right.metas)))

    # One part of one table is empty
    left = new_table((a, b), (), (), 0)
    right = new_table((), (), (c, ), 1000)
    joined = Table.concatenate((left, right), axis=1)
    check_domain(joined, (a, b), (), (c, ))
    assert_equal(joined.X, np.hstack((left.X, right.X)))
    assert_equal(joined.metas, np.hstack((left.metas, right.metas)))

    # Multiple classes, two empty parts are merged
    left = new_table((a, b), (c, ), (), 0)
    right = new_table((), (d, ), (), 1000)
    joined = Table.concatenate((left, right), axis=1)
    check_domain(joined, (a, b), (c, d), ())
    assert_equal(joined.X, np.hstack((left.X, right.X)))
    assert_equal(joined.Y, np.vstack((left.Y, right.Y)).T)

    # Merging of attributes and selection of weights
    first = new_table((a, b), (c, ), (), 0)
    first.attributes = {"a": 5, "b": 7}
    second = new_table((d, ), (e, ), (), 1000)
    second.W = np.arange(5)
    third = new_table((f, g), (), (), 2000)
    third.attributes = {"a": 1, "c": 4}
    third.W = np.arange(5, 10)
    joined = Table.concatenate((first, second, third), axis=1)
    check_domain(joined, (a, b, d, f, g), (c, e), ())
    assert_equal(joined.X, np.hstack((first.X, second.X, third.X)))
    assert_equal(joined.Y, np.vstack((first.Y, second.Y)).T)
    self.assertEqual(joined.attributes, {"a": 5, "b": 7, "c": 4})
    assert_equal(joined.ids, first.ids)
    assert_equal(joined.W, second.W)

    # Raise an exception when no tables are given
    with self.assertRaises(ValueError):
        Table.concatenate((), axis=1)

def test_concatenate_invalid_axis(self):
    """An axis other than 0 or 1 must be rejected with ValueError."""
    with self.assertRaises(ValueError):
        Table.concatenate((), axis=2)

def test_concatenate_names(self):
    """The joined table takes the name of the first named table."""
    a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg")

    unnamed = self._new_table((a, ), (c, ), (d, ), 0)
    second = self._new_table((e, ), (), (f, g), 1000)
    third = self._new_table((b, ), (), (), 1000)
    second.name = "tab2"
    third.name = "tab3"

    joined = Table.concatenate((unnamed, second, third), axis=1)
    self.assertEqual(joined.name, "tab2")

def test_with_column(self):
    """Check `add_column` for attributes, metas, lists and non-primitives."""
    a, b, c, d, e, f, g = map(ContinuousVariable, "abcdefg")
    new_table = self._new_table
    assert_equal = np.testing.assert_equal
    col = np.arange(9, 14)
    colr = col.reshape(5, -1)
    base = new_table((a, b, c), (d, ), (e, f), 0)

    # Add to attributes
    extended = base.add_column(g, col)
    self.assertEqual(extended.domain.attributes, (a, b, c, g))
    assert_equal(extended.X, np.hstack((base.X, colr)))
    assert_equal(extended.Y, base.Y)
    assert_equal(extended.metas, base.metas)

    # Add to metas
    extended = base.add_column(g, col, to_metas=True)
    self.assertEqual(extended.domain.metas, (e, f, g))
    assert_equal(extended.X, base.X)
    assert_equal(extended.Y, base.Y)
    assert_equal(extended.metas, np.hstack((base.metas, colr)))

    # Add to empty attributes
    base = new_table((), (d, ), (e, f), 0)
    extended = base.add_column(g, col)
    self.assertEqual(extended.domain.attributes, (g, ))
    assert_equal(extended.X, colr)
    assert_equal(extended.Y, base.Y)
    assert_equal(extended.metas, base.metas)

    # Add to empty metas
    base = new_table((a, b, c), (d, ), (), 0)
    extended = base.add_column(g, col, to_metas=True)
    self.assertEqual(extended.domain.metas, (g, ))
    assert_equal(extended.X, base.X)
    assert_equal(extended.Y, base.Y)
    assert_equal(extended.metas, colr)

    # Pass values as a list
    base = new_table((a, ), (d, ), (e, f), 0)
    extended = base.add_column(g, [4, 2, -1, 2, 5])
    self.assertEqual(extended.domain.attributes, (a, g))
    expected_x = np.array([[0, 1, 2, 3, 4], [4, 2, -1, 2, 5]]).T
    assert_equal(extended.X, expected_x)

    # Add non-primitives as metas; join `float` and `object` to `object`
    base = new_table((a, ), (d, ), (e, f), 0)
    t = StringVariable("t")
    letters = list("abcde")
    extended = base.add_column(t, letters)
    self.assertEqual(extended.domain.attributes, (a, ))
    self.assertEqual(extended.domain.metas, (e, f, t))
    expected_metas = np.hstack(
        (base.metas, np.array(list("abcde")).reshape(5, -1)))
    assert_equal(extended.metas, expected_metas)


class TestTableFilters(unittest.TestCase):
def setUp(self):
Expand Down
1 change: 0 additions & 1 deletion Orange/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,6 @@ def test_concatenate_exceptions(self):
iris = data.Table("iris")

self.assertRaises(ValueError, data.Table.concatenate, [])
self.assertRaises(ValueError, data.Table.concatenate, [zoo], axis=1)
self.assertRaises(ValueError, data.Table.concatenate, [zoo, iris])

def test_concatenate_sparse(self):
Expand Down