From ed06e3f14085e1c33b3941deeb782b7c12a62373 Mon Sep 17 00:00:00 2001
From: janezd
Date: Fri, 21 Feb 2020 14:37:08 +0100
Subject: [PATCH] util.get_unique_names_duplicates: Fix duplication when indexed name already exists

---
 Orange/data/tests/test_util.py | 21 ++++++++++++++++++++-
 Orange/data/util.py            | 30 +++++++++++++-----------------
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/Orange/data/tests/test_util.py b/Orange/data/tests/test_util.py
index 0006ea4773b..0cf88f0cca5 100644
--- a/Orange/data/tests/test_util.py
+++ b/Orange/data/tests/test_util.py
@@ -59,10 +59,29 @@ def test_get_unique_names_from_duplicates(self):
             ["x (2)", "x (3)", "x (1)"])
         self.assertEqual(
             get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]),
-            ["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"])
+            ["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"])
+        self.assertEqual(
+            get_unique_names_duplicates(["iris", "iris", "iris (1)"]),
+            ["iris (2)", "iris (3)", "iris (1)"])
+
+        self.assertEqual(
+            get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True),
+            (["foo", "bar", "baz"], []))
+        self.assertEqual(
+            get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True),
+            (["foo", "bar (1)", "baz", "bar (2)"], ["bar"]))
+        self.assertEqual(
+            get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True),
+            (["x (2)", "x (3)", "x (1)"], ["x"]))
+        self.assertEqual(
+            get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True),
+            (["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"]))
         self.assertEqual(
             get_unique_names_duplicates(["x", "", "", None, None, "x"]),
             ["x (1)", "", "", None, None, "x (2)"])
+        self.assertEqual(
+            get_unique_names_duplicates(["iris", "iris", "iris (1)"], return_duplicated=True),
+            (["iris (2)", "iris (3)", "iris (1)"], ["iris"]))
 
     def test_get_unique_names_domain(self):
         (attrs, classes, metas), renamed = \
diff --git a/Orange/data/util.py b/Orange/data/util.py
index 273a6bb7602..6652fb0a121 100644
--- a/Orange/data/util.py
+++ b/Orange/data/util.py
@@ -3,7 +3,7 @@
 """
 import re
 from collections import Counter, defaultdict
-from itertools import chain
+from itertools import chain, count
 
 import numpy as np
 import bottleneck as bn
@@ -153,7 +153,7 @@ def get_indices(names, name):
     :param name: str
     :return: list of indices
     """
-    return [int(a.group(2)) for x in names
+    return [int(a.group(2)) for x in filter(None, names)
             for a in re.finditer(RE_FIND_INDEX.format(name), x)]
 
 
@@ -201,26 +201,22 @@ def get_unique_names(names, proposed):
     return [f"{name} ({max_index})" for name in proposed]
 
 
-def get_unique_names_duplicates(proposed: list) -> list:
+def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list:
     """
     Returns list of unique names. If a name is duplicated, the
-    function appends the smallest available index in parentheses.
+    function appends the next available index in parentheses.
 
     For example, a proposed list of names `x`, `x` and `x (2)`
-    results in `x (1)`, `x (3)`, `x (2)`.
+    results in `x (3)`, `x (4)`, `x (2)`.
     """
-    counter = Counter(proposed)
-    index = defaultdict(int)
-    names = []
-    for name in proposed:
-        if name and counter[name] > 1:
-            unique_name = name
-            while unique_name in counter:
-                index[name] += 1
-                unique_name = f"{name} ({index[name]})"
-            name = unique_name
-        names.append(name)
-    return names
+    indices = {name: count(max(get_indices(proposed, name), default=0) + 1)
+               for name, cnt in Counter(proposed).items()
+               if name and cnt > 1}
+    new_names = [f"{name} ({next(indices[name])})" if name in indices else name
+                 for name in proposed]
+    if return_duplicated:
+        return new_names, list(indices)
+    return new_names
 
 
 def get_unique_names_domain(attributes, class_vars=(), metas=()):