Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] table_from_frame: replace nan with String.Unknown for string variable #5795

Merged
merged 1 commit into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,12 @@ def vars_from_df(df, role=None, force_nominal=False):
raise ValueError("String variable must be in metas.")
_role = Role.Meta
var = StringVariable(str(column))
expr = lambda s, _: np.asarray(s, dtype=object)
expr = lambda s, _: np.asarray(
# cast to object first so that fillna can insert Unknown (even if Unknown is nan);
# replacing nan with Unknown and casting to str ensures that all values are strings
s.astype(object).fillna(StringVariable.Unknown).astype(str),
dtype=object
)

cols[_role].append(column)
exprs[_role].append(expr)
Expand Down
44 changes: 35 additions & 9 deletions Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ def test_table_from_frame(self):
[0, pd.Timestamp('1724-12-20').timestamp()],
[0, pd.Timestamp('1724-12-20').timestamp()],
[nan, nan]])
np.testing.assert_equal(table.metas.tolist(), [['a'],
['b'],
['c'],
[nan]])
np.testing.assert_equal(
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
)
names = [var.name for var in table.domain.attributes]
types = [type(var) for var in table.domain.attributes]
self.assertEqual(names, ['1', '2'])
Expand All @@ -63,10 +62,9 @@ def test_table_from_frame(self):
[1, 0, pd.Timestamp('1724-12-20').timestamp()],
[0, 0, pd.Timestamp('1724-12-20').timestamp()],
[0, nan, nan]])
np.testing.assert_equal(table.metas.tolist(), [['a'],
['b'],
['c'],
[nan]])
np.testing.assert_equal(
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
)
names = [var.name for var in table.domain.attributes]
types = [type(var) for var in table.domain.attributes]
self.assertEqual(names, ['index', '1', '2'])
Expand Down Expand Up @@ -383,7 +381,7 @@ def test_table_from_frame_timezones(self):
],
)

def test_table_from_frame_no_datetim(self):
def test_table_from_frame_no_datetime(self):
"""
In case when dtype of column is object and column contains numbers only,
column could be recognized as a TimeVariable since pd.to_datetime can parse
Expand All @@ -402,6 +400,34 @@ def test_table_from_frame_no_datetim(self):
# check that the variable is a DiscreteVariable and was not recognized as a TimeVariable
self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)

def testa_table_from_frame_string(self):
    """
    Check that string-like columns are converted to StringVariable metas
    and that missing values are replaced with StringVariable.Unknown.
    """
    # NOTE(review): the "testa_" prefix looks like a typo for "test_"; the
    # method still starts with "test", so prefix-based discovery finds it.
    from Orange.data.pandas_compat import table_from_frame

    # Column s1 is kept as plain object dtype (missing value stays np.nan);
    # column s2 is cast to the pandas "string" dtype (missing value: pd.NA).
    rows = [["a", "b"], ["c", "d"], ["e", "f"], [5, "c"], [np.nan, np.nan]]
    frame = pd.DataFrame(rows, columns=["s1", "s2"])
    frame = frame.astype({"s1": "object", "s2": "string"})

    table = table_from_frame(frame)

    # Both columns are expected to end up in metas, leaving X without columns.
    np.testing.assert_array_equal(np.empty((5, 0)), table.X)

    # The integer 5 is expected as the string "5"; missing values as Unknown.
    expected_metas = np.array(
        [
            ["a", "b"],
            ["c", "d"],
            ["e", "f"],
            ["5", "c"],
            [StringVariable.Unknown, StringVariable.Unknown],
        ]
    )
    np.testing.assert_array_equal(expected_metas, table.metas)

    only_string_metas = all(
        isinstance(meta_var, StringVariable) for meta_var in table.domain.metas
    )
    self.assertTrue(only_string_metas)

def test_time_variable_compatible(self):
from Orange.data.pandas_compat import table_from_frame

Expand Down