Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] CSV Import: guess data types #4838

Merged
merged 1 commit into from
Jun 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 84 additions & 1 deletion Orange/widgets/data/owcsvimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,6 +1259,8 @@ def expand(ranges):
float_precision="round_trip",
**numbers_format_kwds
)
df = guess_types(df, dtypes, columns_ignored)

if columns_ignored:
# TODO: use 'usecols' parameter in `read_csv` call to
# avoid loading/parsing the columns
Expand All @@ -1270,6 +1272,86 @@ def expand(ranges):
return df


def guess_types(
        df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: List[int]
) -> pd.DataFrame:
    """
    Guess data types for the columns whose type was left on automatic.

    Parameters
    ----------
    df
        Data frame. Its columns are replaced in place.
    dtypes
        The dictionary with data types set by the user, keyed by column
        position. Types are guessed only for positions that are missing
        from it (or mapped to None).
    columns_ignored
        Positions of ignored columns. Ignored columns are skipped.

    Returns
    -------
    The same data frame, with the dtypes of the guessed columns changed
    by `guess_data_type`.
    """
    # Build the lookup once: membership in a list is linear in its length,
    # which adds up for wide files with many ignored columns.
    ignored = frozenset(columns_ignored)
    for i, col in enumerate(df):
        # only when automatic is set in widget dialog
        if dtypes.get(i) is None and i not in ignored:
            df[col] = guess_data_type(df[col])
    return df


def guess_data_type(col: pd.Series) -> pd.Series:
    """
    Guess the type of a column from its values.

    The logic is the same as in guess_data_type from the io_utils module.
    This function only changes the dtype of the column, so that the correct
    Orange.data.variable is used later.
    Logic:
    - if numeric (only numbers)
        - only values {0, 1} or {1, 2} -> DiscreteVariable
        - else -> ContinuousVariable
    - if not numbers:
        - every unique value is accepted by pd.to_datetime -> TimeVariable
        - num_unique_values < len(data) ** 0.7 and < 100 -> DiscreteVariable
        - else -> StringVariable

    Parameters
    ----------
    col
        Data column

    Returns
    -------
    Data column with the guessed dtype. The input column itself is returned
    when no conversion applies.
    """
    def parse_dates(s):
        """
        This is an extremely fast approach to datetime parsing.
        For large data, the same dates are often repeated. Rather than
        re-parse these, we store all unique dates, parse them, and
        use a lookup to convert all dates.

        Returns None when any unique value cannot be parsed as a date.
        """
        try:
            dates = {date: pd.to_datetime(date) for date in s.unique()}
        except (ValueError, TypeError, OverflowError):
            # ValueError: the string is not a date. TypeError: the value can
            # never be a timestamp (e.g. a bool held in an object column).
            # OverflowError: defensive, for values too large to represent.
            # In all cases the column is simply not a date column.
            return None
        return s.map(dates)

    if pdtypes.is_numeric_dtype(col):
        unique_values = col.unique()
        # Two-valued indicator columns are treated as categorical.
        if len(unique_values) <= 2 and (
                len(np.setdiff1d(unique_values, [0, 1])) == 0
                or len(np.setdiff1d(unique_values, [1, 2])) == 0):
            return col.astype("category")
    else:  # object
        # try parse as date - if None not a date
        parsed_col = parse_dates(col)
        if parsed_col is not None:
            return parsed_col
        unique_values = col.unique()
        # Few distinct values relative to the column length -> categorical.
        if len(unique_values) < 100 and len(unique_values) < len(col)**0.7:
            return col.astype("category")
    return col


def clear_stack_on_cancel(f):
"""
A decorator that catches the TaskState.UserCancelException exception
Expand Down Expand Up @@ -1465,7 +1547,8 @@ def pandas_to_table(df):
)
# Remap the coldata into the var.values order/set
coldata = pd.Categorical(
coldata, categories=var.values, ordered=coldata.ordered
coldata.astype("str"), categories=var.values,
ordered=coldata.ordered,
)
codes = coldata.codes
assert np.issubdtype(codes.dtype, np.integer)
Expand Down
6 changes: 6 additions & 0 deletions Orange/widgets/data/tests/data-csv-types.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
time numeric1 discrete1 numeric2 discrete2 string
2020-05-05 1 0 a a
2020-05-06 2 1 a b
2020-05-07 3 0 a c
2020-05-08 4 1 b d
2020-05-09 5 1 b e
37 changes: 37 additions & 0 deletions Orange/widgets/data/tests/test_owcsvimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from AnyQt.QtCore import QSettings

from Orange.data import DiscreteVariable, TimeVariable, ContinuousVariable, \
StringVariable
from Orange.tests import named_file
from Orange.widgets.tests.base import WidgetTest, GuiTest
from Orange.widgets.data import owcsvimport
Expand Down Expand Up @@ -127,6 +129,37 @@ def test_summary(self):
output_sum.assert_called_with(len(output),
format_summary_details(output))

# Import options used by test_type_guessing when loading
# data-csv-types.tab: ASCII encoding, the csv.excel_tab dialect, and
# columns 0-4 explicitly set to ColumnType.Auto.
data_csv_types_options = owcsvimport.Options(
encoding="ascii", dialect=csv.excel_tab(),
columntypes=[
(range(0, 5), ColumnType.Auto),
]
)

def test_type_guessing(self):
    """
    Load data-csv-types.tab with automatic column types and check that
    every column ends up as the expected Orange variable type.
    """
    path = os.path.join(os.path.dirname(__file__), "data-csv-types.tab")
    widget = self.create_widget(
        owcsvimport.OWCSVFileImport,
        stored_settings={
            "_session_items": [
                (path, self.data_csv_types_options.as_dict())
            ]
        }
    )
    widget.commit()
    self.wait_until_finished(widget)
    domain = self.get_output("Data", widget).domain

    # Column name -> variable type the importer is expected to guess.
    expected_types = (
        ("time", TimeVariable),
        ("discrete1", DiscreteVariable),
        ("discrete2", DiscreteVariable),
        ("numeric1", ContinuousVariable),
        ("numeric2", ContinuousVariable),
        ("string", StringVariable),
    )
    for column_name, variable_type in expected_types:
        self.assertIsInstance(domain[column_name], variable_type)


class TestImportDialog(GuiTest):
def test_dialog(self):
Expand Down Expand Up @@ -253,3 +286,7 @@ class dialect(csv.excel):
assert_array_equal(tb.X[:, 0], [np.nan, 0, np.nan])
assert_array_equal(tb.X[:, 1], [0, np.nan, np.nan])
assert_array_equal(tb.X[:, 2], [np.nan, 1, np.nan])


if __name__ == "__main__":
unittest.main()