Skip to content

Commit

Permalink
table_from_frame: replace nan with String.Unknown for string variable
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Jan 19, 2022
1 parent c860359 commit fefdb46
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 6 deletions.
6 changes: 5 additions & 1 deletion Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,11 @@ def vars_from_df(df, role=None, force_nominal=False):
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = lambda s, _: np.asarray(s, dtype=object)
expr = lambda s, _: np.asarray(
# replace NaN with the object that Orange uses for unknown and
# ensure that all values are strings
s.fillna(StringVariable.Unknown).astype(str), dtype=object
)

cols[_role].append(column)
exprs[_role].append(expr)
Expand Down
37 changes: 32 additions & 5 deletions Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ def test_table_from_frame(self):
[0, pd.Timestamp('1724-12-20').timestamp()],
[0, pd.Timestamp('1724-12-20').timestamp()],
[nan, nan]])
np.testing.assert_equal(table.metas.tolist(), [['a'],
['b'],
['c'],
[nan]])
np.testing.assert_equal(
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
)
names = [var.name for var in table.domain.attributes]
types = [type(var) for var in table.domain.attributes]
self.assertEqual(names, ['1', '2'])
Expand Down Expand Up @@ -383,7 +382,7 @@ def test_table_from_frame_timezones(self):
],
)

def test_table_from_frame_no_datetim(self):
def test_table_from_frame_no_datetime(self):
"""
When the dtype of a column is object and the column contains numbers only,
the column could be recognized as a TimeVariable since pd.to_datetime can parse
Expand All @@ -402,6 +401,34 @@ def test_table_from_frame_no_datetim(self):
# check that the variable is a DiscreteVariable and not a TimeVariable
self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)

def testa_table_from_frame_string(self):
    """
    String-like columns must end up as StringVariable metas, with missing
    values stored as StringVariable.Unknown and every other value as str.
    """
    from Orange.data.pandas_compat import table_from_frame

    # Column s1 is cast to dtype object (its missing value is np.nan);
    # column s2 is cast to the pandas "string" dtype (missing value pd.NA).
    rows = [["a", "b"], ["c", "d"], ["e", "f"], [5, "c"], [np.nan, np.nan]]
    frame = pd.DataFrame(rows, columns=["s1", "s2"])
    frame = frame.astype({"s1": "object", "s2": "string"})

    unknown = StringVariable.Unknown
    expected_metas = np.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["5", "c"], [unknown, unknown]]
    )

    converted = table_from_frame(frame)

    # Neither column may land among the attributes: X has five rows, no columns.
    np.testing.assert_array_equal(np.empty((5, 0)), converted.X)
    # The integer 5 is expected as "5" and both missing values as Unknown.
    np.testing.assert_array_equal(expected_metas, converted.metas)
    meta_is_string = [isinstance(v, StringVariable) for v in converted.domain.metas]
    self.assertTrue(all(meta_is_string))

def test_time_variable_compatible(self):
from Orange.data.pandas_compat import table_from_frame

Expand Down

0 comments on commit fefdb46

Please sign in to comment.