Skip to content

Commit

Permalink
pandas_compat: fix table_from_frames for "normal" dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Oct 18, 2021
1 parent 0013e5a commit 7c447b1
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 110 deletions.
184 changes: 74 additions & 110 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,26 @@ def __finalize__(self, other, method=None, **_):
pd.DataFrame.__finalize__ = __finalize__


def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
    """Move a non-trivial index of ``df`` into regular columns.

    The index is left untouched when it is range-like (integer labels that
    are monotonically increasing or decreasing) or when it is a flat string
    index that contains Orange index labels (labels starting with ``"_o"``).
    In every other case the result of ``df.reset_index()`` is returned, so
    the index values end up in the table as ordinary data.

    Parameters
    ----------
    df
        Data frame whose index is inspected; it is not modified in place.

    Returns
    -------
    pd.DataFrame
        ``df`` itself when the index is kept, otherwise a new frame with the
        index converted into columns.
    """
    index = df.index

    # Cheap test first, so the slow per-label scan below is skipped for the
    # common case.  ``inferred_type == "integer"`` is the check performed by
    # the deprecated ``Index.is_integer()``, without the deprecation warning.
    range_like = index.inferred_type == "integer" and (
        index.is_monotonic_increasing or index.is_monotonic_decreasing
    )
    if range_like:
        return df

    # ``startswith`` over every label is slow for long frames, so first make
    # sure the index is flat and actually holds strings.
    has_orange_index = (
        not isinstance(index, pd.MultiIndex)
        and is_string_dtype(index)
        and any(str(label).startswith("_o") for label in index)
    )
    if has_orange_index:
        return df

    return df.reset_index()


def _is_discrete(s, force_nominal):
return (is_categorical_dtype(s) or
is_object_dtype(s) and (force_nominal or
Expand Down Expand Up @@ -207,136 +227,81 @@ def col_type(dt):
).values


def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
else:
_role = role
def to_categorical(s, _):
    """Encode the values of ``s`` as category codes in a numpy array.

    Missing values, which pandas encodes with the code ``-1``, are returned
    as ``np.nan``.  The second argument is ignored; it is accepted so the
    function can be called as ``expr(column, variable)``.
    """
    codes = s.astype("category").cat.codes
    # Equivalent to codes.replace(-1, np.nan), but much faster.
    return np.asarray(codes.where(codes != -1, np.nan))

# If df index is not a simple RangeIndex (or similar), put it into data
if (
# not range-like index - test first to skip slow startswith(_o) check
not (
df.index.is_integer()
and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
)
# check that it does not contain Orange index
and (
# startswith is slow (for long dfs) - first check if col has strings
isinstance(df.index, pd.MultiIndex)
or not is_string_dtype(df.index)
or not any(str(i).startswith("_o") for i in df.index)
)
):
df = df.reset_index()

Xcols, Ycols, Mcols = [], [], []
Xexpr, Yexpr, Mexpr = [], [], []
attrs, class_vars, metas = [], [], []
def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
role = df.orange_role
df = _reset_index(df)

contains_strings = _role == Role.Meta
cols = [], [], []
exprs = [], [], []
vars_ = [], [], []

for column in df.columns:
s = df[column]
_role = Role.Attribute if role is None else role
if hasattr(df, 'orange_variables') and column in df.orange_variables:
original_var = df.orange_variables[column]
var = original_var.copy(compute_value=None)
if _role == Role.Attribute:
Xcols.append(column)
Xexpr.append(None)
attrs.append(var)
elif _role == Role.ClassAttribute:
Ycols.append(column)
Yexpr.append(None)
class_vars.append(var)
else: # if role == Role.Meta:
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
expr = None
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(_convert_datetime)
expr = _convert_datetime
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
discrete.categories.astype(str).tolist())
attrs.append(var)
Xcols.append(column)

def to_cat(s, _):
x = s.astype("category").cat.codes
# it is same than x.replace(-1, np.nan), but much faster
x = x.where(x != -1, np.nan)
return np.asarray(x)

Xexpr.append(to_cat)
discrete = s.astype("category").cat
var = DiscreteVariable(
str(column), discrete.categories.astype(str).tolist()
)
expr = to_categorical
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
)
attrs.append(var)
Xcols.append(column)
Xexpr.append(None)
expr = None
else:
contains_strings = True
if role is not None and role != Role.Meta:
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
metas.append(var)
Mcols.append(column)
Mexpr.append(lambda s, _: np.asarray(s, dtype=object))

# if role isn't explicitly set, try to
# export dataframes into one contiguous block.
# for this all columns must be of the same role
if isinstance(df, OrangeDataFrame) \
and not role \
and contains_strings \
and not force_nominal:
attrs.extend(class_vars)
attrs.extend(metas)
metas = attrs
Xcols.extend(Ycols)
Xcols.extend(Mcols)
Mcols = Xcols
Xexpr.extend(Yexpr)
Xexpr.extend(Mexpr)
Mexpr = Xexpr

attrs, class_vars = [], []
Xcols, Ycols = [], []
Xexpr, Yexpr = [], []

XYM = []
for Avars, Acols, Aexpr in zip(
(attrs, class_vars, metas),
(Xcols, Ycols, Mcols),
(Xexpr, Yexpr, Mexpr)):
if not Acols:
A = None if Acols != Xcols else np.empty((df.shape[0], 0))
XYM.append(A)
continue
if not any(Aexpr):
Adf = df if all(c in Acols
for c in df.columns) else df[Acols]
if all(isinstance(a, SparseDtype) for a in Adf.dtypes):
A = csr_matrix(Adf.sparse.to_coo())
expr = lambda s, _: np.asarray(s, dtype=object)

cols[_role].append(column)
exprs[_role].append(expr)
vars_[_role].append(var)

xym = []
for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
if not a_cols:
arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
elif not any(a_expr):
# if a_cols covers every column of df, use df itself so that the table shares memory with the dataframe
a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
arr = csr_matrix(a_df.sparse.to_coo())
else:
A = np.asarray(Adf)
XYM.append(A)
continue
# we'll have to copy the table to resolve any expressions
# TODO eliminate expr (preprocessing for pandas -> table)
A = np.array([expr(df[col], var) if expr else np.asarray(df[col])
for var, col, expr in zip(Avars, Acols, Aexpr)]).T
XYM.append(A)
arr = np.asarray(a_df)
else:
# we'll have to copy the table to resolve any expressions
arr = np.array(
[
expr(df[col], var) if expr else np.asarray(df[col])
for var, col, expr in zip(a_vars, a_cols, a_expr)
]
).T
xym.append(arr)

# Let the tables share memory with pandas frame
if XYM[1] is not None and XYM[1].ndim == 2 and XYM[1].shape[1] == 1:
XYM[1] = XYM[1][:, 0]
if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
xym[1] = xym[1][:, 0]

return XYM, Domain(attrs, class_vars, metas)
return xym, Domain(*vars_)


def table_from_frame(df, *, force_nominal=False):
Expand Down Expand Up @@ -396,13 +361,12 @@ def table_from_frames(xdf, ydf, mdf):
W = None
for df in dfs:
if isinstance(df, OrangeDataFrame):
W = [df.orange_weights[i] for i in df.index
if i in df.orange_weights]
W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
if len(W) != len(df.index):
W = None
attributes.update(df.orange_attributes)
else:
W = None
attributes.update(df.orange_attributes)

return Table.from_numpy(
domain,
Expand Down
14 changes: 14 additions & 0 deletions Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,20 @@ def _get_orange_demo_datasets():
self.assertEqual(len(df), len(table), assert_message)
self.assertEqual(len(df.columns), len(table.domain.variables), assert_message)

def test_table_from_frames_not_orange_dataframe(self):
    """Plain pandas frames (not OrangeDataFrame) convert to X, Y and metas."""
    attr_frame = pd.DataFrame(
        [[1, 2, 3], [4, 5, 6]], columns=["x1", "x2", "x3"]
    )
    class_frame = pd.DataFrame([[5], [6]], columns=["y"])
    meta_frame = pd.DataFrame([[1, 2], [4, 5]], columns=["m1", "m2"])

    table = Table.from_pandas_dfs(attr_frame, class_frame, meta_frame)

    # Values must be carried over unchanged into the matching table parts.
    np.testing.assert_array_equal(attr_frame, table.X)
    np.testing.assert_array_equal(class_frame.values.flatten(), table.Y)
    np.testing.assert_array_equal(meta_frame, table.metas)

    # Column names become variable names in the matching domain roles.
    domain = table.domain
    self.assertListEqual(
        attr_frame.columns.tolist(), [var.name for var in domain.attributes]
    )
    self.assertEqual(class_frame.columns[0], domain.class_var.name)
    self.assertListEqual(
        meta_frame.columns.tolist(), [var.name for var in domain.metas]
    )


class TestTablePandas(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 7c447b1

Please sign in to comment.