Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] File: Construct unique column names. #4420

Merged
merged 2 commits into from
Feb 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 8 additions & 32 deletions Orange/data/io_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, Optional, Tuple, List, Generator, Callable, Any

from ast import literal_eval
from collections import OrderedDict, Counter
from collections import OrderedDict
from functools import lru_cache
from itertools import chain, repeat
from math import isnan
Expand All @@ -20,6 +20,7 @@
StringVariable, ContinuousVariable, TimeVariable
from Orange.data.io_util import Compression, open_compressed, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.util import get_unique_names_duplicates
from Orange.data.variable import VariableMeta
from Orange.util import Registry, flatten, namegen

Expand Down Expand Up @@ -92,12 +93,15 @@ def split(s):

class _ColumnProperties:
def __init__(self, valuemap=None, values=None, orig_values=None,
coltype=None, coltype_kwargs={}):
coltype=None, coltype_kwargs=None):
self.valuemap = valuemap
self.values = values
self.orig_values = orig_values
self.coltype = coltype
self.coltype_kwargs = dict(coltype_kwargs)
if coltype_kwargs is None:
self.coltype_kwargs = {}
else:
self.coltype_kwargs = dict(coltype_kwargs)


class _TableHeader:
Expand All @@ -114,7 +118,7 @@ def __init__(self, headers: List):
Header rows, to be used for constructing domain.
"""
names, types, flags = self.create_header_data(headers)
self.names = self.rename_variables(names)
self.names = get_unique_names_duplicates(names)
self.types = types
self.flags = flags

Expand Down Expand Up @@ -173,34 +177,6 @@ def _type_from_flag(flags: List[str]) -> List[str]:
def _flag_from_flag(flags: List[str]) -> List[str]:
return [Flags.join(filter(str.islower, flag)) for flag in flags]

@staticmethod
def rename_variables(names: List[str]) -> List[str]:
"""
Rename variables if necessary. Append index to the name, if the name
is duplicated.
Reusing across files still works if both files have same duplicates.

Parameters
----------
names: List
Variable names.

Returns
-------
names: List
Variable names with appended index.
"""
name_counts = Counter(names)
del name_counts[""]
if len(name_counts) != len(names) and name_counts:
uses = {name: 0 for name, count in name_counts.items() if
count > 1}
for i, name in enumerate(names):
if name in uses:
uses[name] += 1
names[i] = f"{name}_{uses[name]}"
return names


class _TableBuilder:
X_ARR, Y_ARR, M_ARR, W_ARR = range(4)
Expand Down
5 changes: 2 additions & 3 deletions Orange/data/tests/test_io_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ def setUpClass(cls):

class TestTableHeader(InitTestData):
def test_rename_variables(self):
names = ["a", "", "b", "", "a", None]
names = _TableHeader.rename_variables(names)
self.assertListEqual(names, ["a_1", "", "b", "", "a_2", None])
th = _TableHeader([["a", "", "b", "", "a"]])
self.assertListEqual(th.names, ["a (1)", "", "b", "", "a (2)"])

def test_get_header_data_0(self):
names, types, flags = _TableHeader.create_header_data([])
Expand Down
9 changes: 9 additions & 0 deletions Orange/data/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def test_get_unique_names_from_duplicates(self):
self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz", "bar"]),
["foo", "bar (1)", "baz", "bar (2)"])
self.assertEqual(
get_unique_names_duplicates(["x", "x", "x (1)"]),
["x (2)", "x (3)", "x (1)"])
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
self.assertEqual(
get_unique_names_duplicates(["x", "", "", None, None, "x"]),
["x (1)", "", "", None, None, "x (2)"])


if __name__ == "__main__":
Expand Down
26 changes: 17 additions & 9 deletions Orange/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Data-manipulation utilities.
"""
import re
from collections import Counter
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
Expand All @@ -25,14 +25,15 @@ def one_hot(values, dtype=float):
result
2d array with ones in respective indicator columns.
"""
if not len(values):
if len(values) == 0:
return np.zeros((0, 0), dtype=dtype)
return np.eye(int(np.max(values) + 1), dtype=dtype)[np.asanyarray(values, dtype=int)]


# pylint: disable=redefined-builtin
def scale(values, min=0, max=1):
"""Return values scaled to [min, max]"""
if not len(values):
if len(values) == 0:
return np.array([])
minval = np.float_(bn.nanmin(values))
ptp = bn.nanmax(values) - minval
Expand Down Expand Up @@ -185,7 +186,8 @@ def get_unique_names(names, proposed):
Return:
str or list of str
"""
from Orange.data import Domain # prevent cyclic import
# prevent cyclic import: pylint: disable=import-outside-toplevel
from Orange.data import Domain
if isinstance(names, Domain):
names = [var.name for var in chain(names.variables, names.metas)]
if isinstance(proposed, str):
Expand All @@ -202,14 +204,20 @@ def get_unique_names(names, proposed):
def get_unique_names_duplicates(proposed: list) -> list:
    """
    Return a list of unique names built from `proposed`.

    A name that occurs only once is kept as it is. Every occurrence of a
    duplicated name gets an index appended in parentheses; indices that
    would produce a name already present in `proposed` are skipped.
    Falsy names (the empty string, `None`) are never renamed, even when
    they are repeated. The input list is not modified.

    For example, a proposed list of names `x`, `x` and `x (2)`
    results in `x (1)`, `x (3)`, `x (2)`.

    Parameters
    ----------
    proposed: list
        Proposed names; items must be hashable.

    Returns
    -------
    names: list
        New list of the same length and order as `proposed`.
    """
    counter = Counter(proposed)
    # Last index handed out for each duplicated name; it only ever grows,
    # so two occurrences of the same name can never get the same suffix.
    index = defaultdict(int)
    names = []
    for name in proposed:
        if name and counter[name] > 1:
            unique_name = name
            # `name` itself is in `counter`, so at least one index is
            # always appended; keep advancing past names taken by the input.
            while unique_name in counter:
                index[name] += 1
                unique_name = f"{name} ({index[name]})"
            name = unique_name
        names.append(name)
    return names
21 changes: 11 additions & 10 deletions Orange/tests/test_tab_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_read_and_save_attributes(self):
file = io.StringIO(samplefile)
table = read_tab_file(file)

f1, f2, c1, c2 = table.domain.variables
_, f2, c1, _ = table.domain.variables
self.assertIsInstance(f2, DiscreteVariable)
self.assertEqual(f2.name, "Feature 2")
self.assertEqual(f2.attributes, {'a': 1, 'b': 2})
Expand All @@ -97,7 +97,7 @@ def test_read_and_save_attributes(self):
file = io.StringIO(saved)
table = read_tab_file(file)

f1, f2, c1, c2 = table.domain.variables
_, f2, c1, _ = table.domain.variables
self.assertIsInstance(f2, DiscreteVariable)
self.assertEqual(f2.name, "Feature 2")
self.assertEqual(f2.attributes, {'a': 1, 'b': 2})
Expand All @@ -106,16 +106,16 @@ def test_read_and_save_attributes(self):
self.assertEqual(c1.name, "Class 1")
self.assertEqual(c1.attributes, {'x': 'a longer string'})

path = "/path/to/somewhere"
c1.attributes["path"] = path
spath = "/path/to/somewhere"
c1.attributes["path"] = spath
outf = io.StringIO()
outf.close = lambda: None
TabReader.write_file(outf, table)
outf.seek(0)

table = read_tab_file(outf)
f1, f2, c1, c2 = table.domain.variables
self.assertEqual(c1.attributes["path"], path)
_, _, c1, _ = table.domain.variables
self.assertEqual(c1.attributes["path"], spath)

def test_read_data_oneline_header(self):
samplefile = """\
Expand Down Expand Up @@ -168,9 +168,9 @@ def test_renaming(self):
table = read_tab_file(filename)
domain = table.domain
self.assertEqual([x.name for x in domain.attributes],
["a_1", "b_1", "a_2", "a_3", "c", "a_5"])
self.assertEqual([x.name for x in domain.class_vars], ["b_2", "a_4"])
self.assertEqual([x.name for x in domain.metas], ["b_3"])
["a (1)", "b (1)", "a (2)", "a (3)", "c", "a (5)"])
self.assertEqual([x.name for x in domain.class_vars], ["b (2)", "a (4)"])
self.assertEqual([x.name for x in domain.metas], ["b (3)"])
finally:
remove(filename)

Expand Down Expand Up @@ -273,7 +273,8 @@ def test_number_of_decimals(self):
self.assertEqual(data.domain["INDUS"].number_of_decimals, 2)
self.assertEqual(data.domain["AGE"].number_of_decimals, 1)

def test_many_discrete(self):
@staticmethod
def test_many_discrete():
b = io.StringIO()
b.write("Poser\nd\n\n")
b.writelines("K" + str(i) + "\n" for i in range(30000))
Expand Down