Skip to content

Commit

Permalink
Merge pull request #4420 from thocevar/file-same_column_names
Browse files Browse the repository at this point in the history
[FIX] File: Construct unique column names.
  • Loading branch information
janezd authored Feb 15, 2020
2 parents aad5488 + 45df9c1 commit 876e06e
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 54 deletions.
40 changes: 8 additions & 32 deletions Orange/data/io_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, Optional, Tuple, List, Generator, Callable, Any

from ast import literal_eval
from collections import OrderedDict, Counter
from collections import OrderedDict
from functools import lru_cache
from itertools import chain, repeat
from math import isnan
Expand All @@ -20,6 +20,7 @@
StringVariable, ContinuousVariable, TimeVariable
from Orange.data.io_util import Compression, open_compressed, \
isnastr, guess_data_type, sanitize_variable
from Orange.data.util import get_unique_names_duplicates
from Orange.data.variable import VariableMeta
from Orange.util import Registry, flatten, namegen

Expand Down Expand Up @@ -92,12 +93,15 @@ def split(s):

class _ColumnProperties:
    """
    Plain holder for the properties of one column.

    Parameters
    ----------
    valuemap, values, orig_values, coltype
        Stored on the instance unchanged.
    coltype_kwargs : mapping, optional
        Copied into a new dict owned by the instance; `None` (the default)
        yields an empty dict.
        NOTE(review): presumably the keyword arguments used together with
        `coltype` -- confirm against the callers.
    """
    def __init__(self, valuemap=None, values=None, orig_values=None,
                 coltype=None, coltype_kwargs=None):
        self.valuemap = valuemap
        self.values = values
        self.orig_values = orig_values
        self.coltype = coltype
        # A None sentinel replaces a mutable `{}` default; the copy keeps
        # the caller's mapping from being shared with this instance.
        if coltype_kwargs is None:
            self.coltype_kwargs = {}
        else:
            self.coltype_kwargs = dict(coltype_kwargs)


class _TableHeader:
Expand All @@ -114,7 +118,7 @@ def __init__(self, headers: List):
Header rows, to be used for constructing domain.
"""
names, types, flags = self.create_header_data(headers)
self.names = self.rename_variables(names)
self.names = get_unique_names_duplicates(names)
self.types = types
self.flags = flags

Expand Down Expand Up @@ -173,34 +177,6 @@ def _type_from_flag(flags: List[str]) -> List[str]:
def _flag_from_flag(flags: List[str]) -> List[str]:
return [Flags.join(filter(str.islower, flag)) for flag in flags]

@staticmethod
def rename_variables(names: List[str]) -> List[str]:
"""
Rename variables if necessary. Append index to the name, if the name
is duplicated.
Reusing across files still works if both files have same duplicates.
Parameters
----------
names: List
Variable names.
Returns
-------
names: List
Variable names with appended index.
"""
name_counts = Counter(names)
del name_counts[""]
if len(name_counts) != len(names) and name_counts:
uses = {name: 0 for name, count in name_counts.items() if
count > 1}
for i, name in enumerate(names):
if name in uses:
uses[name] += 1
names[i] = f"{name}_{uses[name]}"
return names


class _TableBuilder:
X_ARR, Y_ARR, M_ARR, W_ARR = range(4)
Expand Down
5 changes: 2 additions & 3 deletions Orange/data/tests/test_io_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ def setUpClass(cls):

class TestTableHeader(InitTestData):
def test_rename_variables(self):
names = ["a", "", "b", "", "a", None]
names = _TableHeader.rename_variables(names)
self.assertListEqual(names, ["a_1", "", "b", "", "a_2", None])
th = _TableHeader([["a", "", "b", "", "a"]])
self.assertListEqual(th.names, ["a (1)", "", "b", "", "a (2)"])

def test_get_header_data_0(self):
names, types, flags = _TableHeader.create_header_data([])
Expand Down
9 changes: 9 additions & 0 deletions Orange/data/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def test_get_unique_names_from_duplicates(self):
self.assertEqual(
get_unique_names_duplicates(["foo", "bar", "baz", "bar"]),
["foo", "bar (1)", "baz", "bar (2)"])
self.assertEqual(
get_unique_names_duplicates(["x", "x", "x (1)"]),
["x (2)", "x (3)", "x (1)"])
self.assertEqual(
get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
self.assertEqual(
get_unique_names_duplicates(["x", "", "", None, None, "x"]),
["x (1)", "", "", None, None, "x (2)"])


if __name__ == "__main__":
Expand Down
26 changes: 17 additions & 9 deletions Orange/data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Data-manipulation utilities.
"""
import re
from collections import Counter
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
Expand All @@ -25,14 +25,15 @@ def one_hot(values, dtype=float):
result
2d array with ones in respective indicator columns.
"""
if not len(values):
if len(values) == 0:
return np.zeros((0, 0), dtype=dtype)
return np.eye(int(np.max(values) + 1), dtype=dtype)[np.asanyarray(values, dtype=int)]


# pylint: disable=redefined-builtin
def scale(values, min=0, max=1):
"""Return values scaled to [min, max]"""
if not len(values):
if len(values) == 0:
return np.array([])
minval = np.float_(bn.nanmin(values))
ptp = bn.nanmax(values) - minval
Expand Down Expand Up @@ -185,7 +186,8 @@ def get_unique_names(names, proposed):
Return:
str or list of str
"""
from Orange.data import Domain # prevent cyclic import
# prevent cyclic import: pylint: disable=import-outside-toplevel
from Orange.data import Domain
if isinstance(names, Domain):
names = [var.name for var in chain(names.variables, names.metas)]
if isinstance(proposed, str):
Expand All @@ -202,14 +204,20 @@ def get_unique_names(names, proposed):
def get_unique_names_duplicates(proposed: list) -> list:
    """
    Return a list of unique names built from `proposed`.

    Names that occur only once, and falsy names (empty string or `None`),
    are kept unchanged. Each occurrence of a duplicated name gets the
    smallest available index appended in parentheses; an index is skipped
    when the resulting name already appears among the proposed names.

    For example, a proposed list of names `x`, `x` and `x (2)`
    results in `x (1)`, `x (3)`, `x (2)`.

    Parameters
    ----------
    proposed : list
        Proposed names; may contain duplicates, empty strings and `None`.

    Returns
    -------
    list
        Names in the same order as `proposed`, with duplicates made unique.
    """
    counter = Counter(proposed)
    # Last index handed out for each duplicated base name.
    index = defaultdict(int)
    names = []
    for name in proposed:
        # Falsy names ("" or None) are never renamed, even when repeated.
        if name and counter[name] > 1:
            unique_name = name
            # `name` itself is in `counter`, so the index is bumped at
            # least once; keep bumping while the candidate clashes with
            # any proposed name.
            while unique_name in counter:
                index[name] += 1
                unique_name = f"{name} ({index[name]})"
            name = unique_name
        names.append(name)
    return names
21 changes: 11 additions & 10 deletions Orange/tests/test_tab_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_read_and_save_attributes(self):
file = io.StringIO(samplefile)
table = read_tab_file(file)

f1, f2, c1, c2 = table.domain.variables
_, f2, c1, _ = table.domain.variables
self.assertIsInstance(f2, DiscreteVariable)
self.assertEqual(f2.name, "Feature 2")
self.assertEqual(f2.attributes, {'a': 1, 'b': 2})
Expand All @@ -97,7 +97,7 @@ def test_read_and_save_attributes(self):
file = io.StringIO(saved)
table = read_tab_file(file)

f1, f2, c1, c2 = table.domain.variables
_, f2, c1, _ = table.domain.variables
self.assertIsInstance(f2, DiscreteVariable)
self.assertEqual(f2.name, "Feature 2")
self.assertEqual(f2.attributes, {'a': 1, 'b': 2})
Expand All @@ -106,16 +106,16 @@ def test_read_and_save_attributes(self):
self.assertEqual(c1.name, "Class 1")
self.assertEqual(c1.attributes, {'x': 'a longer string'})

path = "/path/to/somewhere"
c1.attributes["path"] = path
spath = "/path/to/somewhere"
c1.attributes["path"] = spath
outf = io.StringIO()
outf.close = lambda: None
TabReader.write_file(outf, table)
outf.seek(0)

table = read_tab_file(outf)
f1, f2, c1, c2 = table.domain.variables
self.assertEqual(c1.attributes["path"], path)
_, _, c1, _ = table.domain.variables
self.assertEqual(c1.attributes["path"], spath)

def test_read_data_oneline_header(self):
samplefile = """\
Expand Down Expand Up @@ -168,9 +168,9 @@ def test_renaming(self):
table = read_tab_file(filename)
domain = table.domain
self.assertEqual([x.name for x in domain.attributes],
["a_1", "b_1", "a_2", "a_3", "c", "a_5"])
self.assertEqual([x.name for x in domain.class_vars], ["b_2", "a_4"])
self.assertEqual([x.name for x in domain.metas], ["b_3"])
["a (1)", "b (1)", "a (2)", "a (3)", "c", "a (5)"])
self.assertEqual([x.name for x in domain.class_vars], ["b (2)", "a (4)"])
self.assertEqual([x.name for x in domain.metas], ["b (3)"])
finally:
remove(filename)

Expand Down Expand Up @@ -273,7 +273,8 @@ def test_number_of_decimals(self):
self.assertEqual(data.domain["INDUS"].number_of_decimals, 2)
self.assertEqual(data.domain["AGE"].number_of_decimals, 1)

def test_many_discrete(self):
@staticmethod
def test_many_discrete():
b = io.StringIO()
b.write("Poser\nd\n\n")
b.writelines("K" + str(i) + "\n" for i in range(30000))
Expand Down

0 comments on commit 876e06e

Please sign in to comment.