Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSV file Import: Backward compatibility #4876

Merged
merged 1 commit into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions Orange/widgets/data/owcsvimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import typing
from typing import (
List, Tuple, Dict, Optional, Any, Callable, Iterable, Hashable,
Union, AnyStr, BinaryIO
Union, AnyStr, BinaryIO, Set
)

from PyQt5.QtCore import (
Expand Down Expand Up @@ -486,6 +486,13 @@ class Error(widget.OWWidget.Error):
"directory": "",
"filter": ""
}) # type: Dict[str, str]

# Column type guessing was added to this widget, which breaks compatibility
# with older saved workflows, where column types were not guessed. When
# compatibility_mode=True, the widget keeps the older behaviour (no guessing).
settings_version = 2
compatibility_mode = settings.Setting(False, schema_only=True)

MaxHistorySize = 50

want_main_area = False
Expand Down Expand Up @@ -844,7 +851,7 @@ def progress_(i, j):

task.future = self.__executor.submit(
clear_stack_on_cancel(load_csv),
path, opts, progress_,
path, opts, progress_, self.compatibility_mode
)
task.watcher.setFuture(task.future)
w = task.watcher
Expand Down Expand Up @@ -1043,6 +1050,11 @@ def _restoreState(self):
if idx != -1:
self.recent_combo.setCurrentIndex(idx)

@classmethod
def migrate_settings(cls, settings, version):
if not version or version < 2:
settings["compatibility_mode"] = True


@singledispatch
def sniff_csv(file, samplesize=2 ** 20):
Expand Down Expand Up @@ -1160,8 +1172,8 @@ def _mime_type_for_path(path):
}


def load_csv(path, opts, progress_callback=None):
# type: (Union[AnyStr, BinaryIO], Options, ...) -> pd.DataFrame
def load_csv(path, opts, progress_callback=None, compatibility_mode=False):
# type: (Union[AnyStr, BinaryIO], Options, ..., bool) -> pd.DataFrame
def dtype(coltype):
# type: (ColumnType) -> Optional[str]
if coltype == ColumnType.Numeric:
Expand Down Expand Up @@ -1256,7 +1268,10 @@ def expand(ranges):
float_precision="round_trip",
**numbers_format_kwds
)
df = guess_types(df, dtypes, columns_ignored)

# for older workflows, skip type guessing
if not compatibility_mode:
df = guess_types(df, dtypes, columns_ignored)

if columns_ignored:
# TODO: use 'usecols' parameter in `read_csv` call to
Expand All @@ -1270,7 +1285,7 @@ def expand(ranges):


def guess_types(
df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: List[int]
df: pd.DataFrame, dtypes: Dict[int, str], columns_ignored: Set[int]
) -> pd.DataFrame:
"""
Guess data type for variables according to values.
Expand Down
36 changes: 33 additions & 3 deletions Orange/widgets/data/tests/test_owcsvimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def test_type_guessing(self):
stored_settings={
"_session_items": [
(path, self.data_csv_types_options.as_dict())
]
],
"__version__": 2 # guessing works for versions >= 2
}
)
widget.commit()
Expand All @@ -160,9 +161,37 @@ def test_type_guessing(self):
self.assertIsInstance(domain["numeric2"], ContinuousVariable)
self.assertIsInstance(domain["string"], StringVariable)

def test_backward_compatibility(self):
"""
Check that the widget keeps the old behaviour for workflows with version < 2
"""
dirname = os.path.dirname(__file__)
path = os.path.join(dirname, "data-csv-types.tab")
widget = self.create_widget(
owcsvimport.OWCSVFileImport,
stored_settings={
"_session_items": [
(path, self.data_csv_types_options.as_dict())
],
"__version__": 1 # guessing works for versions >= 2
}
)
widget.commit()
self.wait_until_finished(widget)
output = self.get_output("Data", widget)
domain = output.domain

self.assertIsInstance(domain["time"], StringVariable)
self.assertIsInstance(domain["discrete1"], ContinuousVariable)
self.assertIsInstance(domain["discrete2"], StringVariable)
self.assertIsInstance(domain["numeric1"], ContinuousVariable)
self.assertIsInstance(domain["numeric2"], ContinuousVariable)
self.assertIsInstance(domain["string"], StringVariable)


class TestImportDialog(GuiTest):
def test_dialog(self):
@staticmethod
def test_dialog():
dirname = os.path.dirname(__file__)
path = os.path.join(dirname, "grep_file.txt")
d = owcsvimport.CSVImportDialog()
Expand Down Expand Up @@ -241,7 +270,8 @@ def test_load_csv(self):
list(df.iloc[:, 1]), ["one", "three"]
)

def test_convert(self):
@staticmethod
def test_convert():
contents = (
b'I, J, K\n'
b' , A, \n'
Expand Down