From 12d1e091abb396739f742b2cc45a353015a74c8c Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 21 Feb 2020 14:37:08 +0100 Subject: [PATCH] util.get_unique_names_duplicates: Fix duplication when indexed name already exists --- Orange/data/tests/test_util.py | 31 ++++++++++++++++++++++++++++++- Orange/data/util.py | 34 +++++++++++++++------------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/Orange/data/tests/test_util.py b/Orange/data/tests/test_util.py index 2df6c287be9..ee526bb1116 100644 --- a/Orange/data/tests/test_util.py +++ b/Orange/data/tests/test_util.py @@ -59,10 +59,39 @@ def test_get_unique_names_from_duplicates(self): ["x (2)", "x (3)", "x (1)"]) self.assertEqual( get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"]), - ["x (2) (1)", "x (1)", "x (4)", "x (2) (2)", "x (3)"]) + ["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"]) + self.assertEqual( + get_unique_names_duplicates(["iris", "iris", "iris (1)"]), + ["iris (2)", "iris (3)", "iris (1)"]) + + self.assertEqual( + get_unique_names_duplicates(["foo", "bar", "baz"], return_duplicated=True), + (["foo", "bar", "baz"], [])) + self.assertEqual( + get_unique_names_duplicates(["foo", "bar", "baz", "bar"], return_duplicated=True), + (["foo", "bar (1)", "baz", "bar (2)"], ["bar"])) + self.assertEqual( + get_unique_names_duplicates(["x", "x", "x (1)"], return_duplicated=True), + (["x (2)", "x (3)", "x (1)"], ["x"])) + self.assertEqual( + get_unique_names_duplicates(["x (2)", "x", "x", "x (2)", "x (3)"], return_duplicated=True), + (["x (2) (1)", "x (4)", "x (5)", "x (2) (2)", "x (3)"], ["x (2)", "x"])) self.assertEqual( get_unique_names_duplicates(["x", "", "", None, None, "x"]), ["x (1)", "", "", None, None, "x (2)"]) + self.assertEqual( + get_unique_names_duplicates(["iris", "iris", "iris (1)"], return_duplicated=True), + (["iris (2)", "iris (3)", "iris (1)"], ["iris"])) + + self.assertEqual( + get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)"]), + ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)"] + ) + + self.assertEqual( + get_unique_names_duplicates(["iris (1) (1)", "iris (1)", "iris (1)", "iris", "iris"]), + ["iris (1) (1)", "iris (1) (2)", "iris (1) (3)", "iris (2)", "iris (3)"] + ) def test_get_unique_names_domain(self): (attrs, classes, metas), renamed = \ diff --git a/Orange/data/util.py b/Orange/data/util.py index 3d81a57ec5d..0a583702c18 100644 --- a/Orange/data/util.py +++ b/Orange/data/util.py @@ -2,8 +2,8 @@ Data-manipulation utilities. """ import re -from collections import Counter, defaultdict -from itertools import chain +from collections import Counter +from itertools import chain, count from typing import Callable import numpy as np @@ -155,8 +155,8 @@ def get_indices(names, name): :param name: str :return: list of indices """ - return [int(a.group(2)) for x in names - for a in re.finditer(RE_FIND_INDEX.format(name), x)] + return [int(a.group(2)) for x in filter(None, names) + for a in re.finditer(RE_FIND_INDEX.format(re.escape(name)), x)] def get_unique_names(names, proposed): @@ -203,26 +203,22 @@ def get_unique_names(names, proposed): return [f"{name} ({max_index})" for name in proposed] -def get_unique_names_duplicates(proposed: list) -> list: +def get_unique_names_duplicates(proposed: list, return_duplicated=False) -> list: """ Returns list of unique names. If a name is duplicated, the - function appends the smallest available index in parentheses. + function appends the next available index in parentheses. For example, a proposed list of names `x`, `x` and `x (2)` - results in `x (1)`, `x (3)`, `x (2)`. + results in `x (3)`, `x (4)`, `x (2)`. """ - counter = Counter(proposed) - index = defaultdict(int) - names = [] - for name in proposed: - if name and counter[name] > 1: - unique_name = name - while unique_name in counter: - index[name] += 1 - unique_name = f"{name} ({index[name]})" - name = unique_name - names.append(name) - return names + indices = {name: count(max(get_indices(proposed, name), default=0) + 1) + for name, cnt in Counter(proposed).items() + if name and cnt > 1} + new_names = [f"{name} ({next(indices[name])})" if name in indices else name + for name in proposed] + if return_duplicated: + return new_names, list(indices) + return new_names def get_unique_names_domain(attributes, class_vars=(), metas=()):