pandas_compat: fix table_from_frames for "normal" dataframe

biolab · Oct 18, 2021 · 7c447b1 · 7c447b1
1 parent 0013e5a
commit 7c447b1
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 110 deletions.
diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py
@@ -146,6 +146,26 @@ def __finalize__(self, other, method=None, **_):
     pd.DataFrame.__finalize__ = __finalize__
 
 
+def _reset_index(df: pd.DataFrame) -> pd.DataFrame:
+    """If df index is not a simple RangeIndex (or similar), include it into a table"""
+    if (
+        # not range-like index - test first to skip slow startswith(_o) check
+        not (
+            df.index.is_integer()
+            and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
+        )
+        # check that it does not contain Orange index
+        and (
+            # startswith is slow (for long dfs) - first check if col has strings
+            isinstance(df.index, pd.MultiIndex)
+            or not is_string_dtype(df.index)
+            or not any(str(i).startswith("_o") for i in df.index)
+        )
+    ):
+        df = df.reset_index()
+    return df
+
+
 def _is_discrete(s, force_nominal):
     return (is_categorical_dtype(s) or
             is_object_dtype(s) and (force_nominal or
@@ -207,136 +227,81 @@ def col_type(dt):
     ).values
 
 
-def vars_from_df(df, role=None, force_nominal=False):
-    if role is None and hasattr(df, 'orange_role'):
-        _role = df.orange_role
-    else:
-        _role = role
+def to_categorical(s, _):
+    x = s.astype("category").cat.codes
+    # this is the same as x.replace(-1, np.nan), but much faster
+    x = x.where(x != -1, np.nan)
+    return np.asarray(x)
 
-    # If df index is not a simple RangeIndex (or similar), put it into data
-    if (
-        # not range-like index - test first to skip slow startswith(_o) check
-        not (
-            df.index.is_integer()
-            and (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing)
-        )
-        # check that it does not contain Orange index
-        and (
-            # startswith is slow (for long drs) - firs check if col has strings
-            isinstance(df.index, pd.MultiIndex)
-            or not is_string_dtype(df.index)
-            or not any(str(i).startswith("_o") for i in df.index)
-        )
-    ):
-        df = df.reset_index()
 
-    Xcols, Ycols, Mcols = [], [], []
-    Xexpr, Yexpr, Mexpr = [], [], []
-    attrs, class_vars, metas = [], [], []
+def vars_from_df(df, role=None, force_nominal=False):
+    if role is None and hasattr(df, 'orange_role'):
+        role = df.orange_role
+    df = _reset_index(df)
 
-    contains_strings = _role == Role.Meta
+    cols = [], [], []
+    exprs = [], [], []
+    vars_ = [], [], []
 
     for column in df.columns:
         s = df[column]
+        _role = Role.Attribute if role is None else role
         if hasattr(df, 'orange_variables') and column in df.orange_variables:
             original_var = df.orange_variables[column]
             var = original_var.copy(compute_value=None)
-            if _role == Role.Attribute:
-                Xcols.append(column)
-                Xexpr.append(None)
-                attrs.append(var)
-            elif _role == Role.ClassAttribute:
-                Ycols.append(column)
-                Yexpr.append(None)
-                class_vars.append(var)
-            else:  # if role == Role.Meta:
-                Mcols.append(column)
-                Mexpr.append(None)
-                metas.append(var)
+            expr = None
         elif _is_datetime(s):
             var = TimeVariable(str(column))
-            attrs.append(var)
-            Xcols.append(column)
-            Xexpr.append(_convert_datetime)
+            expr = _convert_datetime
         elif _is_discrete(s, force_nominal):
-            discrete = s.astype('category').cat
-            var = DiscreteVariable(str(column),
-                                   discrete.categories.astype(str).tolist())
-            attrs.append(var)
-            Xcols.append(column)
-
-            def to_cat(s, _):
-                x = s.astype("category").cat.codes
-                # it is same than x.replace(-1, np.nan), but much faster
-                x = x.where(x != -1, np.nan)
-                return np.asarray(x)
-
-            Xexpr.append(to_cat)
+            discrete = s.astype("category").cat
+            var = DiscreteVariable(
+                str(column), discrete.categories.astype(str).tolist()
+            )
+            expr = to_categorical
         elif is_numeric_dtype(s):
             var = ContinuousVariable(
                 # set number of decimals to 0 if int else keeps default behaviour
                 str(column), number_of_decimals=(0 if is_integer_dtype(s) else None)
             )
-            attrs.append(var)
-            Xcols.append(column)
-            Xexpr.append(None)
+            expr = None
         else:
-            contains_strings = True
+            if role is not None and role != Role.Meta:
+                raise ValueError("String variable must be in metas.")
+            _role = Role.Meta
             var = StringVariable(str(column))
-            metas.append(var)
-            Mcols.append(column)
-            Mexpr.append(lambda s, _: np.asarray(s, dtype=object))
-
-    # if role isn't explicitly set, try to
-    # export dataframes into one contiguous block.
-    # for this all columns must be of the same role
-    if isinstance(df, OrangeDataFrame) \
-            and not role \
-            and contains_strings \
-            and not force_nominal:
-        attrs.extend(class_vars)
-        attrs.extend(metas)
-        metas = attrs
-        Xcols.extend(Ycols)
-        Xcols.extend(Mcols)
-        Mcols = Xcols
-        Xexpr.extend(Yexpr)
-        Xexpr.extend(Mexpr)
-        Mexpr = Xexpr
-
-        attrs, class_vars = [], []
-        Xcols, Ycols = [], []
-        Xexpr, Yexpr = [], []
-
-    XYM = []
-    for Avars, Acols, Aexpr in zip(
-            (attrs, class_vars, metas),
-            (Xcols, Ycols, Mcols),
-            (Xexpr, Yexpr, Mexpr)):
-        if not Acols:
-            A = None if Acols != Xcols else np.empty((df.shape[0], 0))
-            XYM.append(A)
-            continue
-        if not any(Aexpr):
-            Adf = df if all(c in Acols
-                            for c in df.columns) else df[Acols]
-            if all(isinstance(a, SparseDtype) for a in Adf.dtypes):
-                A = csr_matrix(Adf.sparse.to_coo())
+            expr = lambda s, _: np.asarray(s, dtype=object)
+
+        cols[_role].append(column)
+        exprs[_role].append(expr)
+        vars_[_role].append(var)
+
+    xym = []
+    for a_vars, a_cols, a_expr in zip(vars_, cols, exprs):
+        if not a_cols:
+            arr = None if a_cols != cols[0] else np.empty((df.shape[0], 0))
+        elif not any(a_expr):
+            # if a_cols covers every df column, use df itself so the table can share memory with it
+            a_df = df if all(c in a_cols for c in df.columns) else df[a_cols]
+            if all(isinstance(a, SparseDtype) for a in a_df.dtypes):
+                arr = csr_matrix(a_df.sparse.to_coo())
             else:
-                A = np.asarray(Adf)
-            XYM.append(A)
-            continue
-        # we'll have to copy the table to resolve any expressions
-        # TODO eliminate expr (preprocessing for pandas -> table)
-        A = np.array([expr(df[col], var) if expr else np.asarray(df[col])
-                      for var, col, expr in zip(Avars, Acols, Aexpr)]).T
-        XYM.append(A)
+                arr = np.asarray(a_df)
+        else:
+            # we'll have to copy the table to resolve any expressions
+            arr = np.array(
+                [
+                    expr(df[col], var) if expr else np.asarray(df[col])
+                    for var, col, expr in zip(a_vars, a_cols, a_expr)
+                ]
+            ).T
+        xym.append(arr)
 
     # Let the tables share memory with pandas frame
-    if XYM[1] is not None and XYM[1].ndim == 2 and XYM[1].shape[1] == 1:
-        XYM[1] = XYM[1][:, 0]
+    if xym[1] is not None and xym[1].ndim == 2 and xym[1].shape[1] == 1:
+        xym[1] = xym[1][:, 0]
 
-    return XYM, Domain(attrs, class_vars, metas)
+    return xym, Domain(*vars_)
 
 
 def table_from_frame(df, *, force_nominal=False):
@@ -396,13 +361,12 @@ def table_from_frames(xdf, ydf, mdf):
     W = None
     for df in dfs:
         if isinstance(df, OrangeDataFrame):
-            W = [df.orange_weights[i] for i in df.index
-                 if i in df.orange_weights]
+            W = [df.orange_weights[i] for i in df.index if i in df.orange_weights]
             if len(W) != len(df.index):
                 W = None
+            attributes.update(df.orange_attributes)
         else:
             W = None
-        attributes.update(df.orange_attributes)
 
     return Table.from_numpy(
         domain,

diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py
@@ -428,6 +428,20 @@ def _get_orange_demo_datasets():
             self.assertEqual(len(df), len(table), assert_message)
             self.assertEqual(len(df.columns), len(table.domain.variables), assert_message)
 
+    def test_table_from_frames_not_orange_dataframe(self):
+        x = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["x1", "x2", "x3"])
+        y = pd.DataFrame([[5], [6]], columns=["y"])
+        m = pd.DataFrame([[1, 2], [4, 5]], columns=["m1", "m2"])
+        new_table = Table.from_pandas_dfs(x, y, m)
+
+        np.testing.assert_array_equal(x, new_table.X)
+        np.testing.assert_array_equal(y.values.flatten(), new_table.Y)
+        np.testing.assert_array_equal(m, new_table.metas)
+        d = new_table.domain
+        self.assertListEqual(x.columns.tolist(), [a.name for a in d.attributes])
+        self.assertEqual(y.columns[0], d.class_var.name)
+        self.assertListEqual(m.columns.tolist(), [a.name for a in d.metas])
+
 
 class TestTablePandas(unittest.TestCase):
     def setUp(self):