diff --git a/Orange/data/tests/test_util.py b/Orange/data/tests/test_util.py index 2df6c287be9..f594abffaea 100644 --- a/Orange/data/tests/test_util.py +++ b/Orange/data/tests/test_util.py @@ -59,10 +59,39 @@ def test_get_unique_names_from_duplicates(self): ["x (2)", "x (3)", "x (1)"]) self.assertEqual( get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]), - ["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"]) + ["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"]) + self.assertEqual( + get_unique_names_duplicates(["iris", "iris", "iris (1)"]), + ["iris (2)", "iris (3)", "iris (1)"]) + + self.assertEqual( + get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True), + (["foo", "bar", "baz"], [])) + self.assertEqual( + get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True), + (["foo", "bar (1)", "baz", "bar (2)"], ["bar"])) + self.assertEqual( + get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True), + (["x (2)", "x (3)", "x (1)"], ["x"])) + self.assertEqual( + get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True), + (["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"])) self.assertEqual( get_unique_names_duplicates(["x", "", "", None, None, "x"]), ["x (1)", "", "", None, None, "x (2)"]) + self.assertEqual( + get_unique_names_duplicates(["iris", "iris", "iris (1)", "iris (2)"], return_duplicated=True), + (["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"])) + + self.assertEqual( + get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)"]), + ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"] + ) + + self.assertEqual( + get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"]), + ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)", "iris (3)"] + ) def test_get_unique_names_domain(self): (attrs, classes, metas), renamed = \ diff --git a/Orange/data/util.py b/Orange/data/util.py index 3d81a57ec5d..0a583702c18 100644 --- a/Orange/data/util.py +++ b/Orange/data/util.py @@ -2,8 +2,8 @@ Data-manipulation utilities. """ import re -from collections import Counter, defaultdict -from itertools import chain +from collections import Counter +from itertools import chain, count from typing import Callable import numpy as np @@ -155,8 +155,8 @@ def get_indices(names, name): :param name: str :return: list of indices """ - return [int(a.group(2)) for x in names - for a in re.finditer(RE_FIND_INDEX.format(name), x)] + return [int(a.group(2)) for x in filter(None, names) + for a in re.finditer(RE_FIND_INDEX.format(re.escape(name)), x)] def get_unique_names(names, proposed): @@ -203,26 +203,22 @@ def get_unique_names(names, proposed): return [f"{name} ({max_index})" for name in proposed] -def get_unique_names_duplicates(proposed: list) -> list: +def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list: """ Returns list of unique names. If a name is duplicated, the - function appends the smallest available index in parentheses. + function appends the next available index in parentheses. For example, a proposed list of names `x`, `x` and `x (2)` - results in `x (1)`, `x (3)`, `x (2)`. + results in `x (3)`, `x (4)`, `x (2)`. """ - counter = Counter(proposed) - index = defaultdict(int) - names = [] - for name in proposed: - if name and counter[name] > 1: - unique_name = name - while unique_name in counter: - index[name] += 1 - unique_name = f"{name} ({index[name]})" - name = unique_name - names.append(name) - return names + indices = {name: count(max(get_indices(proposed, name), default=0) + 1) + for name, cnt in Counter(proposed).items() + if name and cnt > 1} + new_names = [f"{name} ({next(indices[name])})" if name in indices else name + for name in proposed] + if return_duplicated: + return new_names, list(indices) + return new_names def get_unique_names_domain(attributes, class_vars=(), metas=()): diff --git a/Orange/widgets/data/owfile.py b/Orange/widgets/data/owfile.py index c33c0014b4e..2e2594e1797 100644 --- a/Orange/widgets/data/owfile.py +++ b/Orange/widgets/data/owfile.py @@ -21,7 +21,7 @@ from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, \ open_filename_dialog from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.widgets.widget import Output +from Orange.widgets.widget import Output, Msg # Backward compatibility: class RecentPath used to be defined in this module, # and it is used in saved (pickled) settings. It must be imported into the @@ -121,17 +121,19 @@ class Outputs: domain_editor = SettingProvider(DomainEditor) class Warning(widget.OWWidget.Warning): - file_too_big = widget.Msg("The file is too large to load automatically." - " Press Reload to load.") - load_warning = widget.Msg("Read warning:\n{}") - performance_warning = widget.Msg( + file_too_big = Msg("The file is too large to load automatically." + " Press Reload to load.") + load_warning = Msg("Read warning:\n{}") + performance_warning = Msg( "Categorical variables with >100 values may decrease performance.") + renamed_vars = Msg("Some variables have been renamed " + "to avoid duplicates.\n{}") class Error(widget.OWWidget.Error): - file_not_found = widget.Msg("File not found.") - missing_reader = widget.Msg("Missing reader.") - sheet_error = widget.Msg("Error listing available sheets.") - unknown = widget.Msg("Read error:\n{}") + file_not_found = Msg("File not found.") + missing_reader = Msg("Missing reader.") + sheet_error = Msg("Error listing available sheets.") + unknown = Msg("Read error:\n{}") class NoFileSelected: pass @@ -478,10 +480,13 @@ def _inspect_discrete_variables(self, domain): def apply_domain_edit(self): self.Warning.performance_warning.clear() + self.Warning.renamed_vars.clear() if self.data is None: table = None else: - domain, cols = self.domain_editor.get_domain(self.data.domain, self.data) + domain, cols, renamed = \ + self.domain_editor.get_domain(self.data.domain, self.data, + deduplicate=True) if not (domain.variables or domain.metas): table = None elif domain is self.data.domain: @@ -493,6 +498,8 @@ def apply_domain_edit(self): table.ids = np.array(self.data.ids) table.attributes = getattr(self.data, 'attributes', {}) self._inspect_discrete_variables(domain) + if renamed: + self.Warning.renamed_vars(f"Renamed: {', '.join(renamed)}") self.Outputs.data.send(table) self.apply_button.setEnabled(False) diff --git a/Orange/widgets/data/tests/test_owfile.py b/Orange/widgets/data/tests/test_owfile.py index c1cf41157eb..e3292f575cd 100644 --- a/Orange/widgets/data/tests/test_owfile.py +++ b/Orange/widgets/data/tests/test_owfile.py @@ -141,6 +141,22 @@ def test_domain_changes_are_stored(self): data = self.get_output(self.widget.Outputs.data) self.assertIsInstance(data.domain["iris"], StringVariable) + def test_rename_duplicates(self): + self.open_dataset("iris") + + idx = self.widget.domain_editor.model().createIndex(3, 0) + self.assertFalse(self.widget.Warning.renamed_vars.is_shown()) + self.widget.domain_editor.model().setData(idx, "iris", Qt.EditRole) + self.widget.apply_button.click() + data = self.get_output(self.widget.Outputs.data) + self.assertIn("iris (1)", data.domain) + self.assertIn("iris (2)", data.domain) + self.assertTrue(self.widget.Warning.renamed_vars.is_shown()) + + self.widget.domain_editor.model().setData(idx, "different iris", Qt.EditRole) + self.widget.apply_button.click() + self.assertFalse(self.widget.Warning.renamed_vars.is_shown()) + def test_variable_name_change(self): """ Test whether the name of the variable is changed correctly by @@ -155,6 +171,12 @@ def test_variable_name_change(self): data = self.get_output(self.widget.Outputs.data) self.assertIn("a", data.domain) + idx = self.widget.domain_editor.model().createIndex(3, 0) + self.widget.domain_editor.model().setData(idx, "d", Qt.EditRole) + self.widget.apply_button.click() + data = self.get_output(self.widget.Outputs.data) + self.assertIn("d", data.domain) + # rename and change to text idx = self.widget.domain_editor.model().createIndex(4, 0) self.widget.domain_editor.model().setData(idx, "b", Qt.EditRole) @@ -250,6 +272,13 @@ def test_check_column_noname(self): self.widget.domain_editor.model().setData(idx, "", Qt.EditRole) self.assertEqual(self.widget.domain_editor.model().data(idx, Qt.DisplayRole), temp) + def test_invalid_role_mode(self): + self.open_dataset("iris") + model = self.widget.domain_editor.model() + idx = model.createIndex(1, 0) + self.assertFalse(model.setData(idx, Qt.StatusTipRole, "")) + self.assertIsNone(model.data(idx, Qt.StatusTipRole)) + def test_context_match_includes_variable_values(self): file1 = """\ var diff --git a/Orange/widgets/utils/domaineditor.py b/Orange/widgets/utils/domaineditor.py index 7b358dbe20b..1b7a47cb7dd 100644 --- a/Orange/widgets/utils/domaineditor.py +++ b/Orange/widgets/utils/domaineditor.py @@ -10,6 +10,7 @@ from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \ TimeVariable, Domain +from Orange.data.util import get_unique_names_duplicates from Orange.statistics.util import unique from Orange.widgets import gui from Orange.widgets.gui import HorizontalGridDelegate @@ -61,13 +62,14 @@ def set_variables(self, variables): def rowCount(self, parent): return 0 if parent.isValid() else len(self.variables) - def columnCount(self, parent): + @staticmethod + def columnCount(parent): return 0 if parent.isValid() else Column.not_valid def data(self, index, role): row, col = index.row(), index.column() val = self.variables[row][col] - if role == Qt.DisplayRole or role == Qt.EditRole: + if role in (Qt.DisplayRole, Qt.EditRole): if col == Column.tpe: return self.type2name[val] if col == Column.place: @@ -90,8 +92,9 @@ def data(self, index, role): font = QFont() font.setBold(True) return font + return None - def setData(self, index, value, role): + def setData(self, index, value, role=Qt.EditRole): row, col = index.row(), index.column() row_data = self.variables[row] if role == Qt.EditRole: @@ -110,6 +113,7 @@ def setData(self, index, value, role): # Settings may change background colors self.dataChanged.emit(index.sibling(row, 0), index.sibling(row, 3)) return True + return False def headerData(self, i, orientation, role=Qt.DisplayRole): if orientation == Qt.Horizontal and role == Qt.DisplayRole and i < 4: @@ -130,7 +134,7 @@ def __init__(self, view, items): self.view = view self.items = items - def createEditor(self, parent, option, index): + def createEditor(self, parent, _option, index): # This ugly hack closes the combo when the user selects an item class Combo(QComboBox): def __init__(self, *args): @@ -145,6 +149,8 @@ def showPopup(self, *args): super().showPopup(*args) self.popup_shown = True + # Here, we need `self` from the closure + # pylint: disable=no-self-argument,attribute-defined-outside-init def hidePopup(me): if me.popup_shown: self.view.model().setData( @@ -250,21 +256,27 @@ def _merge(cols, force_dense=False): sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols] return sp.hstack(sparse_cols).tocsr() - def get_domain(self, domain, data): - """Create domain (and dataset) from changes made in the widget. - - Parameters - ---------- - domain : old domain - data : source data + def get_domain(self, domain, data, deduplicate=False): + """ + Create domain (and dataset) from changes made in the widget. Returns ------- - (new_domain, [attribute_columns, class_var_columns, meta_columns]) + + Args: + domain (Domain): original domain + data (Table): original data + deduplicate (bool): if True, variable names are deduplicated and + the result contains an additional list with names of renamed + variables + + Returns: + (new_domain, [attribute_columns, class_var_columns, meta_columns]) + or + (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed) """ # Allow type-checking with type() instead of isinstance() for exact comparison # pylint: disable=unidiomatic-typecheck - variables = self.model().variables places = [[], [], []] # attributes, class_vars, metas cols = [[], [], []] # Xcols, Ycols, Mcols @@ -283,8 +295,17 @@ def numbers_are_round(var, col_data): chain(((at, Place.feature) for at in domain.attributes), ((cl, Place.class_var) for cl in domain.class_vars), ((mt, Place.meta) for mt in domain.metas)))): - return domain, [data.X, data.Y, data.metas] + if deduplicate: + return domain, [data.X, data.Y, data.metas], [] + else: + return domain, [data.X, data.Y, data.metas] + relevant_names = [var[0] for var in variables if var[2] != Place.skip] + if deduplicate: + renamed_iter = iter(get_unique_names_duplicates(relevant_names)) + else: + renamed_iter = iter(relevant_names) + renamed = [] for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \ zip(variables, chain([(at, Place.feature) for at in domain.attributes], @@ -293,13 +314,17 @@ def numbers_are_round(var, col_data): if place == Place.skip: continue + new_name = next(renamed_iter) + if new_name != name and name not in renamed: + renamed.append(name) + col_data = self._get_column(data, orig_var, orig_plc) is_sparse = sp.issparse(col_data) - if name == orig_var.name and tpe == type(orig_var): + if new_name == orig_var.name and tpe == type(orig_var): var = orig_var elif tpe == type(orig_var): - var = orig_var.copy(name=name) + var = orig_var.copy(name=new_name) elif tpe == DiscreteVariable: values = list(str(i) for i in unique(col_data) if not self._is_missing(i)) round_numbers = numbers_are_round(orig_var, col_data) @@ -307,10 +332,10 @@ def numbers_are_round(var, col_data): for x in self._iter_vals(col_data)] if round_numbers: values = [str(int(float(v))) for v in values] - var = tpe(name, values) + var = tpe(new_name, values) col_data = self._to_column(col_data, is_sparse) elif tpe == StringVariable: - var = tpe.make(name) + var = tpe.make(new_name) if type(orig_var) in [DiscreteVariable, TimeVariable]: col_data = [orig_var.repr_val(x) if not np.isnan(x) else "" for x in self._iter_vals(col_data)] @@ -324,25 +349,29 @@ def numbers_are_round(var, col_data): # in metas which are transformed to dense below col_data = self._to_column(col_data, False, dtype=object) elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable: - var = tpe.make(name) + var = tpe.make(new_name) if may_be_numeric: col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)]) for x in self._iter_vals(col_data)] col_data = self._to_column(col_data, is_sparse) else: - var = tpe(name) + var = tpe(new_name) places[place].append(var) cols[place].append(col_data) # merge columns for X, Y and metas feats = cols[Place.feature] - X = self._merge(feats) if len(feats) else np.empty((len(data), 0)) + X = self._merge(feats) if feats else np.empty((len(data), 0)) Y = self._merge(cols[Place.class_var], force_dense=True) m = self._merge(cols[Place.meta], force_dense=True) domain = Domain(*places) - return domain, [X, Y, m] + if deduplicate: + return domain, [X, Y, m], renamed + else: + return domain, [X, Y, m] - def _get_column(self, data, source_var, source_place): + @staticmethod + def _get_column(data, source_var, source_place): """ Extract column from data and preserve sparsity. """ if source_place == Place.meta: col_data = data[:, source_var].metas diff --git a/Orange/widgets/utils/tests/test_domaineditor.py b/Orange/widgets/utils/tests/test_domaineditor.py new file mode 100644 index 00000000000..c5e445cce2e --- /dev/null +++ b/Orange/widgets/utils/tests/test_domaineditor.py @@ -0,0 +1,115 @@ +import unittest + +import numpy as np + +from orangewidget.settings import SettingProvider +from orangewidget.widget import OWBaseWidget +from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \ + TimeVariable, Domain, Table +from Orange.widgets.tests.base import GuiTest +from Orange.widgets.utils.domaineditor import DomainEditor, Column + + +class MockWidget(OWBaseWidget): + name = "mock" + domain_editor = SettingProvider(DomainEditor) + + def __init__(self): + self.domain_editor = DomainEditor(self) + + +class DomainEditorTest(GuiTest): + def setUp(self): + self.widget = MockWidget() + self.editor = self.widget.domain_editor + + self.orig_variables = [ + ["d1", DiscreteVariable, 0, "x, y, z, ...", False], + ["d2", DiscreteVariable, 0, "1, 2, 3, ...", True], + ["c1", ContinuousVariable, 0, "", True], + ["d3", DiscreteVariable, 1, "4, 3, 6, ...", True], + ["s", StringVariable, 2, "", False], + ["t", TimeVariable, 2, "", True] + ] + self.domain = Domain( + [DiscreteVariable("d1", values=list("xyzw")), + DiscreteVariable("d2", values=list("12345")), + ContinuousVariable("c1")], + DiscreteVariable("d3", values=list("4368")), + [StringVariable("s"), + TimeVariable("t")]) + + def test_deduplication(self): + editor = self.editor + model = editor.model() + model.set_orig_variables(self.orig_variables) + model.reset_variables() + + data = Table.from_numpy( + self.domain, + np.zeros((1, 3)), np.zeros((1, 1)), np.array([["foo", 42]])) + + # No duplicates + + domain, _ = \ + editor.get_domain(self.domain, data) + self.assertEqual([var.name for var in domain.attributes], + ["d1", "d2", "c1"]) + self.assertEqual([var.name for var in domain.class_vars], + ["d3"]) + self.assertEqual([var.name for var in domain.metas], + ["s", "t"]) + + domain, _, renamed = \ + editor.get_domain(self.domain, data, deduplicate=True) + self.assertEqual([var.name for var in domain.attributes], + ["d1", "d2", "c1"]) + self.assertEqual([var.name for var in domain.class_vars], + ["d3"]) + self.assertEqual([var.name for var in domain.metas], + ["s", "t"]) + self.assertEqual(renamed, []) + + + # Duplicates + + model.setData(model.index(3, Column.name), "d2") + model.setData(model.index(5, Column.name), "s") + + domain, _ = \ + editor.get_domain(self.domain, data) + self.assertEqual([var.name for var in domain.attributes], + ["d1", "d2", "c1"]) + self.assertEqual([var.name for var in domain.class_vars], + ["d2"]) + self.assertEqual([var.name for var in domain.metas], + ["s", "s"]) + + domain, _, renamed = \ + editor.get_domain(self.domain, data, deduplicate=True) + self.assertEqual([var.name for var in domain.attributes], + ["d1", "d2 (1)", "c1"]) + self.assertEqual([var.name for var in domain.class_vars], + ["d2 (2)"]) + self.assertEqual([var.name for var in domain.metas], + ["s (1)", "s (2)"]) + self.assertEqual(renamed, ["d2", "s"]) + + + # Duplicates, some skipped + + model.setData(model.index(5, Column.place), "skip") + + domain, _, renamed = \ + editor.get_domain(self.domain, data, deduplicate=True) + self.assertEqual([var.name for var in domain.attributes], + ["d1", "d2 (1)", "c1"]) + self.assertEqual([var.name for var in domain.class_vars], + ["d2 (2)"]) + self.assertEqual([var.name for var in domain.metas], + ["s"]) + self.assertEqual(renamed, ["d2"]) + + +if __name__ == "__main__": + unittest.main()