Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Ensure unique var names in file #4431

Merged
merged 4 commits into from
Feb 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion Orange/data/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,39 @@ def test_get_unique_names_from_duplicates(self):
["x (2)", "x (3)", "x (1)"])
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"])
self.assertEqual(
get_unique_names_duplicates(["iris", "iris", "iris (1)"]),
["iris (2)", "iris (3)", "iris (1)"])

self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True),
(["foo", "bar", "baz"], []))
self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True),
(["foo", "bar (1)", "baz", "bar (2)"], ["bar"]))
self.assertEqual(
get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True),
(["x (2)", "x (3)", "x (1)"], ["x"]))
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True),
(["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"]))
self.assertEqual(
get_unique_names_duplicates(["x", "", "", None, None, "x"]),
["x (1)", "", "", None, None, "x (2)"])
self.assertEqual(
get_unique_names_duplicates(["iris", "iris", "iris (1)", "iris (2)"], return_duplicated=True),
(["iris (3)", "iris (4)", "iris (1)", "iris (2)"], ["iris"]))

self.assertEqual(
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)"]),
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"]
)

self.assertEqual(
get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"]),
["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)", "iris (3)"]
)

def test_get_unique_names_domain(self):
(attrs, classes, metas), renamed = \
Expand Down
34 changes: 15 additions & 19 deletions Orange/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Data-manipulation utilities.
"""
import re
from collections import Counter, defaultdict
from itertools import chain
from collections import Counter
from itertools import chain, count
from typing import Callable

import numpy as np
Expand Down Expand Up @@ -155,8 +155,8 @@ def get_indices(names, name):
:param name: str
:return: list of indices
"""
return [int(a.group(2)) for x in names
for a in re.finditer(RE_FIND_INDEX.format(name), x)]
return [int(a.group(2)) for x in filter(None, names)
for a in re.finditer(RE_FIND_INDEX.format(re.escape(name)), x)]


def get_unique_names(names, proposed):
Expand Down Expand Up @@ -203,26 +203,22 @@ def get_unique_names(names, proposed):
return [f"{name} ({max_index})" for name in proposed]


def get_unique_names_duplicates(proposed: list) -> list:
def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list:
"""
Returns list of unique names. If a name is duplicated, the
function appends the smallest available index in parentheses.
function appends the next available index in parentheses.

For example, a proposed list of names `x`, `x` and `x (2)`
results in `x (1)`, `x (3)`, `x (2)`.
results in `x (3)`, `x (4)`, `x (2)`.
"""
counter = Counter(proposed)
index = defaultdict(int)
names = []
for name in proposed:
if name and counter[name] > 1:
unique_name = name
while unique_name in counter:
index[name] += 1
unique_name = f"{name} ({index[name]})"
name = unique_name
names.append(name)
return names
indices = {name: count(max(get_indices(proposed, name), default=0) + 1)
for name, cnt in Counter(proposed).items()
if name and cnt > 1}
new_names = [f"{name} ({next(indices[name])})" if name in indices else name
for name in proposed]
if return_duplicated:
return new_names, list(indices)
return new_names


def get_unique_names_domain(attributes, class_vars=(), metas=()):
Expand Down
27 changes: 17 additions & 10 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, \
open_filename_dialog
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.widget import Output
from Orange.widgets.widget import Output, Msg

# Backward compatibility: class RecentPath used to be defined in this module,
# and it is used in saved (pickled) settings. It must be imported into the
Expand Down Expand Up @@ -121,17 +121,19 @@ class Outputs:
domain_editor = SettingProvider(DomainEditor)

class Warning(widget.OWWidget.Warning):
file_too_big = widget.Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = widget.Msg("Read warning:\n{}")
performance_warning = widget.Msg(
file_too_big = Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = Msg("Read warning:\n{}")
performance_warning = Msg(
"Categorical variables with >100 values may decrease performance.")
renamed_vars = Msg("Some variables have been renamed "
"to avoid duplicates.\n{}")

class Error(widget.OWWidget.Error):
file_not_found = widget.Msg("File not found.")
missing_reader = widget.Msg("Missing reader.")
sheet_error = widget.Msg("Error listing available sheets.")
unknown = widget.Msg("Read error:\n{}")
file_not_found = Msg("File not found.")
missing_reader = Msg("Missing reader.")
sheet_error = Msg("Error listing available sheets.")
unknown = Msg("Read error:\n{}")

class NoFileSelected:
pass
Expand Down Expand Up @@ -478,10 +480,13 @@ def _inspect_discrete_variables(self, domain):

def apply_domain_edit(self):
self.Warning.performance_warning.clear()
self.Warning.renamed_vars.clear()
if self.data is None:
table = None
else:
domain, cols = self.domain_editor.get_domain(self.data.domain, self.data)
domain, cols, renamed = \
self.domain_editor.get_domain(self.data.domain, self.data,
deduplicate=True)
if not (domain.variables or domain.metas):
table = None
elif domain is self.data.domain:
Expand All @@ -493,6 +498,8 @@ def apply_domain_edit(self):
table.ids = np.array(self.data.ids)
table.attributes = getattr(self.data, 'attributes', {})
self._inspect_discrete_variables(domain)
if renamed:
self.Warning.renamed_vars(f"Renamed: {', '.join(renamed)}")

self.Outputs.data.send(table)
self.apply_button.setEnabled(False)
Expand Down
29 changes: 29 additions & 0 deletions Orange/widgets/data/tests/test_owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,22 @@ def test_domain_changes_are_stored(self):
data = self.get_output(self.widget.Outputs.data)
self.assertIsInstance(data.domain["iris"], StringVariable)

def test_rename_duplicates(self):
self.open_dataset("iris")

idx = self.widget.domain_editor.model().createIndex(3, 0)
self.assertFalse(self.widget.Warning.renamed_vars.is_shown())
self.widget.domain_editor.model().setData(idx, "iris", Qt.EditRole)
self.widget.apply_button.click()
data = self.get_output(self.widget.Outputs.data)
self.assertIn("iris (1)", data.domain)
self.assertIn("iris (2)", data.domain)
self.assertTrue(self.widget.Warning.renamed_vars.is_shown())

self.widget.domain_editor.model().setData(idx, "different iris", Qt.EditRole)
self.widget.apply_button.click()
self.assertFalse(self.widget.Warning.renamed_vars.is_shown())

def test_variable_name_change(self):
"""
Test whether the name of the variable is changed correctly by
Expand All @@ -155,6 +171,12 @@ def test_variable_name_change(self):
data = self.get_output(self.widget.Outputs.data)
self.assertIn("a", data.domain)

idx = self.widget.domain_editor.model().createIndex(3, 0)
self.widget.domain_editor.model().setData(idx, "d", Qt.EditRole)
self.widget.apply_button.click()
data = self.get_output(self.widget.Outputs.data)
self.assertIn("d", data.domain)

# rename and change to text
idx = self.widget.domain_editor.model().createIndex(4, 0)
self.widget.domain_editor.model().setData(idx, "b", Qt.EditRole)
Expand Down Expand Up @@ -250,6 +272,13 @@ def test_check_column_noname(self):
self.widget.domain_editor.model().setData(idx, "", Qt.EditRole)
self.assertEqual(self.widget.domain_editor.model().data(idx, Qt.DisplayRole), temp)

def test_invalid_role_mode(self):
self.open_dataset("iris")
model = self.widget.domain_editor.model()
idx = model.createIndex(1, 0)
self.assertFalse(model.setData(idx, Qt.StatusTipRole, ""))
self.assertIsNone(model.data(idx, Qt.StatusTipRole))

def test_context_match_includes_variable_values(self):
file1 = """\
var
Expand Down
75 changes: 52 additions & 23 deletions Orange/widgets/utils/domaineditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from Orange.data import DiscreteVariable, ContinuousVariable, StringVariable, \
TimeVariable, Domain
from Orange.data.util import get_unique_names_duplicates
from Orange.statistics.util import unique
from Orange.widgets import gui
from Orange.widgets.gui import HorizontalGridDelegate
Expand Down Expand Up @@ -61,13 +62,14 @@ def set_variables(self, variables):
def rowCount(self, parent):
return 0 if parent.isValid() else len(self.variables)

def columnCount(self, parent):
@staticmethod
def columnCount(parent):
return 0 if parent.isValid() else Column.not_valid

def data(self, index, role):
row, col = index.row(), index.column()
val = self.variables[row][col]
if role == Qt.DisplayRole or role == Qt.EditRole:
if role in (Qt.DisplayRole, Qt.EditRole):
if col == Column.tpe:
return self.type2name[val]
if col == Column.place:
Expand All @@ -90,8 +92,9 @@ def data(self, index, role):
font = QFont()
font.setBold(True)
return font
return None

def setData(self, index, value, role):
def setData(self, index, value, role=Qt.EditRole):
row, col = index.row(), index.column()
row_data = self.variables[row]
if role == Qt.EditRole:
Expand All @@ -110,6 +113,7 @@ def setData(self, index, value, role):
# Settings may change background colors
self.dataChanged.emit(index.sibling(row, 0), index.sibling(row, 3))
return True
return False

def headerData(self, i, orientation, role=Qt.DisplayRole):
if orientation == Qt.Horizontal and role == Qt.DisplayRole and i < 4:
Expand All @@ -130,7 +134,7 @@ def __init__(self, view, items):
self.view = view
self.items = items

def createEditor(self, parent, option, index):
def createEditor(self, parent, _option, index):
# This ugly hack closes the combo when the user selects an item
class Combo(QComboBox):
def __init__(self, *args):
Expand All @@ -145,6 +149,8 @@ def showPopup(self, *args):
super().showPopup(*args)
self.popup_shown = True

# Here, we need `self` from the closure
# pylint: disable=no-self-argument,attribute-defined-outside-init
def hidePopup(me):
if me.popup_shown:
self.view.model().setData(
Expand Down Expand Up @@ -250,21 +256,27 @@ def _merge(cols, force_dense=False):
sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols]
return sp.hstack(sparse_cols).tocsr()

def get_domain(self, domain, data):
"""Create domain (and dataset) from changes made in the widget.

Parameters
----------
domain : old domain
data : source data
def get_domain(self, domain, data, deduplicate=False):
"""
Create domain (and dataset) from changes made in the widget.

Returns
-------
(new_domain, [attribute_columns, class_var_columns, meta_columns])

Args:
domain (Domain): original domain
data (Table): original data
deduplicate (bool): if True, variable names are deduplicated and
the result contains an additional list with names of renamed
variables

Returns:
(new_domain, [attribute_columns, class_var_columns, meta_columns])
or
(new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
"""
# Allow type-checking with type() instead of isinstance() for exact comparison
# pylint: disable=unidiomatic-typecheck

variables = self.model().variables
places = [[], [], []] # attributes, class_vars, metas
cols = [[], [], []] # Xcols, Ycols, Mcols
Expand All @@ -283,8 +295,17 @@ def numbers_are_round(var, col_data):
chain(((at, Place.feature) for at in domain.attributes),
((cl, Place.class_var) for cl in domain.class_vars),
((mt, Place.meta) for mt in domain.metas)))):
return domain, [data.X, data.Y, data.metas]
if deduplicate:
return domain, [data.X, data.Y, data.metas], []
else:
return domain, [data.X, data.Y, data.metas]

relevant_names = [var[0] for var in variables if var[2] != Place.skip]
if deduplicate:
renamed_iter = iter(get_unique_names_duplicates(relevant_names))
else:
renamed_iter = iter(relevant_names)
renamed = []
for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
zip(variables,
chain([(at, Place.feature) for at in domain.attributes],
Expand All @@ -293,24 +314,28 @@ def numbers_are_round(var, col_data):
if place == Place.skip:
continue

new_name = next(renamed_iter)
if new_name != name and name not in renamed:
renamed.append(name)

col_data = self._get_column(data, orig_var, orig_plc)
is_sparse = sp.issparse(col_data)

if name == orig_var.name and tpe == type(orig_var):
if new_name == orig_var.name and tpe == type(orig_var):
var = orig_var
elif tpe == type(orig_var):
var = orig_var.copy(name=name)
var = orig_var.copy(name=new_name)
elif tpe == DiscreteVariable:
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
round_numbers = numbers_are_round(orig_var, col_data)
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
for x in self._iter_vals(col_data)]
if round_numbers:
values = [str(int(float(v))) for v in values]
var = tpe(name, values)
var = tpe(new_name, values)
col_data = self._to_column(col_data, is_sparse)
elif tpe == StringVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if type(orig_var) in [DiscreteVariable, TimeVariable]:
col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
for x in self._iter_vals(col_data)]
Expand All @@ -324,25 +349,29 @@ def numbers_are_round(var, col_data):
# in metas which are transformed to dense below
col_data = self._to_column(col_data, False, dtype=object)
elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if may_be_numeric:
col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
for x in self._iter_vals(col_data)]
col_data = self._to_column(col_data, is_sparse)
else:
var = tpe(name)
var = tpe(new_name)
places[place].append(var)
cols[place].append(col_data)

# merge columns for X, Y and metas
feats = cols[Place.feature]
X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
X = self._merge(feats) if feats else np.empty((len(data), 0))
Y = self._merge(cols[Place.class_var], force_dense=True)
m = self._merge(cols[Place.meta], force_dense=True)
domain = Domain(*places)
return domain, [X, Y, m]
if deduplicate:
return domain, [X, Y, m], renamed
else:
return domain, [X, Y, m]

def _get_column(self, data, source_var, source_place):
@staticmethod
def _get_column(data, source_var, source_place):
""" Extract column from data and preserve sparsity. """
if source_place == Place.meta:
col_data = data[:, source_var].metas
Expand Down
Loading