Skip to content

Commit

Permalink
File / Domain Editor: Construct domains with unique names
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Feb 20, 2020
1 parent 35231b7 commit 5ab856e
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 37 deletions.
18 changes: 10 additions & 8 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,16 +124,16 @@ class Warning(widget.OWWidget.Warning):
file_too_big = Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = Msg("Read warning:\n{}")
performance_warning = widget.Msg(
performance_warning = Msg(
"Categorical variables with >100 values may decrease performance.")
renamed_vars = Msg("Some variables have been renamed "
"to avoid duplicates.\n{}")

class Error(widget.OWWidget.Error):
file_not_found = widget.Msg("File not found.")
missing_reader = widget.Msg("Missing reader.")
sheet_error = widget.Msg("Error listing available sheets.")
unknown = widget.Msg("Read error:\n{}")
file_not_found = Msg("File not found.")
missing_reader = Msg("Missing reader.")
sheet_error = Msg("Error listing available sheets.")
unknown = Msg("Read error:\n{}")

class NoFileSelected:
pass
Expand Down Expand Up @@ -484,7 +484,9 @@ def apply_domain_edit(self):
if self.data is None:
table = None
else:
domain, cols = self.domain_editor.get_domain(self.data.domain, self.data)
domain, cols, renamed = \
self.domain_editor.get_domain(self.data.domain, self.data,
deduplicate=True)
if not (domain.variables or domain.metas):
table = None
elif domain is self.data.domain:
Expand All @@ -496,8 +498,8 @@ def apply_domain_edit(self):
table.ids = np.array(self.data.ids)
table.attributes = getattr(self.data, 'attributes', {})
self._inspect_discrete_variables(domain)
if self.domain_editor.renamed_variables:
self.Warning.renamed_vars(', '.join(self.domain_editor.renamed_variables))
if renamed:
self.Warning.renamed_vars(', '.join(renamed))

self.Outputs.data.send(table)
self.apply_button.setEnabled(False)
Expand Down
71 changes: 42 additions & 29 deletions Orange/widgets/utils/domaineditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,6 @@ def __init__(self, widget):
self.place_delegate = PlaceDelegate(self, VarTableModel.places)
self.setItemDelegateForColumn(Column.place, self.place_delegate)

self.renamed_variables = []

@staticmethod
def _is_missing(x):
return str(x) in ("nan", "")
Expand Down Expand Up @@ -253,21 +251,27 @@ def _merge(cols, force_dense=False):
sparse_cols = [c if sp.issparse(c) else sp.csc_matrix(c) for c in cols]
return sp.hstack(sparse_cols).tocsr()

def get_domain(self, domain, data):
"""Create domain (and dataset) from changes made in the widget.
Parameters
----------
domain : old domain
data : source data
def get_domain(self, domain, data, deduplicate=False):
"""
Create domain (and dataset) from changes made in the widget.
Returns
-------
(new_domain, [attribute_columns, class_var_columns, meta_columns])
Args:
domain (Domain): original domain
data (Table): original data
deduplicate (bool): if True, variable names are deduplicated and
the result contains an additional list with names of renamed
variables
Returns:
(new_domain, [attribute_columns, class_var_columns, meta_columns])
or
(new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
"""
# Allow type-checking with type() instead of isinstance() for exact comparison
# pylint: disable=unidiomatic-typecheck
self.renamed_variables = []
variables = self.model().variables
places = [[], [], []] # attributes, class_vars, metas
cols = [[], [], []] # Xcols, Ycols, Mcols
Expand All @@ -286,41 +290,47 @@ def numbers_are_round(var, col_data):
chain(((at, Place.feature) for at in domain.attributes),
((cl, Place.class_var) for cl in domain.class_vars),
((mt, Place.meta) for mt in domain.metas)))):
return domain, [data.X, data.Y, data.metas]

unique_names = get_unique_names_duplicates([var[0] for var in variables])
for var, u in zip(variables, unique_names):
if var[0] != u:
self.renamed_variables.append(var[0])
var[0] = u
self.model().set_variables(variables)
if deduplicate:
return domain, [data.X, data.Y, data.metas], []
else:
return domain, [data.X, data.Y, data.metas]

relevant_names = [var[0] for var in variables if var[2] != Place.skip]
if deduplicate:
renamed_iter = iter(get_unique_names_duplicates(relevant_names))
else:
renamed_iter = iter(relevant_names)
renamed = []
for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
zip(variables,
chain([(at, Place.feature) for at in domain.attributes],
[(cl, Place.class_var) for cl in domain.class_vars],
[(mt, Place.meta) for mt in domain.metas])):
chain([(at, Place.feature) for at in domain.attributes],
[(cl, Place.class_var) for cl in domain.class_vars],
[(mt, Place.meta) for mt in domain.metas])):
if place == Place.skip:
continue

new_name = next(renamed_iter)
if new_name != name:
renamed.append(name)

col_data = self._get_column(data, orig_var, orig_plc)
is_sparse = sp.issparse(col_data)

if name == orig_var.name and tpe == type(orig_var):
if new_name == orig_var.name and tpe == type(orig_var):
var = orig_var
elif tpe == type(orig_var):
var = orig_var.copy(name=name)
var = orig_var.copy(name=new_name)
elif tpe == DiscreteVariable:
values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
round_numbers = numbers_are_round(orig_var, col_data)
col_data = [np.nan if self._is_missing(x) else values.index(str(x))
for x in self._iter_vals(col_data)]
if round_numbers:
values = [str(int(float(v))) for v in values]
var = tpe(name, values)
var = tpe(new_name, values)
col_data = self._to_column(col_data, is_sparse)
elif tpe == StringVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if type(orig_var) in [DiscreteVariable, TimeVariable]:
col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
for x in self._iter_vals(col_data)]
Expand All @@ -334,13 +344,13 @@ def numbers_are_round(var, col_data):
# in metas which are transformed to dense below
col_data = self._to_column(col_data, False, dtype=object)
elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
var = tpe.make(name)
var = tpe.make(new_name)
if may_be_numeric:
col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
for x in self._iter_vals(col_data)]
col_data = self._to_column(col_data, is_sparse)
else:
var = tpe(name)
var = tpe(new_name)
places[place].append(var)
cols[place].append(col_data)

Expand All @@ -350,7 +360,10 @@ def numbers_are_round(var, col_data):
Y = self._merge(cols[Place.class_var], force_dense=True)
m = self._merge(cols[Place.meta], force_dense=True)
domain = Domain(*places)
return domain, [X, Y, m]
if deduplicate:
return domain, [X, Y, m], renamed
else:
return domain, [X, Y, m]

def _get_column(self, data, source_var, source_place):
""" Extract column from data and preserve sparsity. """
Expand Down

0 comments on commit 5ab856e

Please sign in to comment.