def format_variables_string(variables):
    """
    Format the descriptive part of the input/output summary for either
    features, targets or metas of the input dataset.

    A lone variable is described by its kind only (``categorical``), several
    variables of one kind by their count and kind (``3 numeric``), and a mixed
    group by the total followed by a per-kind breakdown
    (``4 (2 categorical, 1 numeric, 1 time)``).

    :param variables: Features, targets or metas of the input dataset
    :return: A formatted string; an em dash when there are no variables
    """
    if not variables:
        return '—'

    # Tested in this order: a `TimeVariable` is also a `ContinuousVariable`,
    # so the more specific class has to come first for time variables to be
    # labelled 'time' rather than 'numeric'. Using `isinstance` (instead of an
    # exact type comparison) keeps subclasses of these types in the count.
    kinds = ((DiscreteVariable, 'categorical'),
             (TimeVariable, 'time'),
             (ContinuousVariable, 'numeric'),
             (StringVariable, 'string (not shown)'))
    # Order in which the kinds are listed in the summary. 'other' collects
    # variables of unrecognised classes, so the total always covers every
    # variable that was passed in.
    order = ('categorical', 'numeric', 'time', 'string (not shown)', 'other')
    counts = dict.fromkeys(order, 0)
    for var in variables:
        label = next((name for var_type, name in kinds
                      if isinstance(var, var_type)), 'other')
        counts[label] += 1

    agg = [(label, counts[label]) for label in order if counts[label]]
    if not agg:
        # A truthy iterable that yielded nothing (e.g. an empty generator)
        return '—'
    if len(agg) > 1:
        breakdown = ', '.join(f'{count} {label}' for label, count in agg)
        return f'{sum(counts.values())} ({breakdown})'
    label, count = agg[0]
    return label if count == 1 else f'{count} {label}'


def format_summary_details(data):
    """
    Form the entire descriptive part of the input/output summary.

    The result has four lines: the number of instances and features, followed
    by one line each for features, target and metas, formatted by
    `format_variables_string`.

    :param data: A dataset
    :type data: Orange.data.Table
    :return: A formatted string; an empty string when `data` is None
    """
    def _plural(number):
        return 's' * (number != 1)

    # Test for None explicitly: a table without rows is falsy, but it still
    # has a domain that is worth describing.
    if data is None:
        return ''

    domain = data.domain
    n_instances = len(data)
    n_features = len(domain.variables) + len(domain.metas)
    features = format_variables_string(domain.attributes)
    targets = format_variables_string(domain.class_vars)
    metas = format_variables_string(domain.metas)

    return (
        f'{n_instances} instance{_plural(n_instances)}, '
        f'{n_features} feature{_plural(n_features)}\n'
        f'Features: {features}\nTarget: {targets}\nMetas: {metas}'
    )
def _to_timestamps(years):
    """Return epoch timestamps for January 1st of each year; NaN stays NaN.

    The datetimes are pinned to UTC so that the fixture values do not depend
    on the local timezone of the machine running the tests.
    """
    return [
        np.nan if np.isnan(year)
        else datetime.datetime(
            year, 1, 1, tzinfo=datetime.timezone.utc).timestamp()
        for year in years
    ]


def make_table(attributes, target=None, metas=None):
    """Build an instance of a table given various variables.

    Parameters
    ----------
    attributes : Iterable[Tuple[Variable, np.array]]
    target : Optional[Iterable[Tuple[Variable, np.array]]]
    metas : Optional[Iterable[Tuple[Variable, np.array]]]

    Returns
    -------
    Table

    """
    def _unzip(pairs):
        # Split (variable, column) pairs into the variables and a 2d array
        # holding one column per variable.
        variables, columns = zip(*pairs)
        return variables, np.array(columns).T

    attribute_vars, attribute_vals = _unzip(attributes)

    target_vars, target_vals = None, None
    if target is not None:
        target_vars, target_vals = _unzip(target)

    meta_vars, meta_vals = None, None
    if metas is not None:
        meta_vars, meta_vals = _unzip(metas)

    return Table.from_numpy(
        Domain(attribute_vars, class_vars=target_vars, metas=meta_vars),
        X=attribute_vals, Y=target_vals, metas=meta_vals,
    )


class TestUtils(unittest.TestCase):
    def _assert_details(self, data, features, target, metas,
                        feature_noun='features'):
        """Compare `format_summary_details(data)` with the expected text.

        The header counts are taken from `data`; the descriptions of the
        features, target and metas are given by the caller.
        """
        n_features = len(data.domain.variables) + len(data.domain.metas)
        details = f'{len(data)} instances, ' \
                  f'{n_features} {feature_noun}\n' \
                  f'Features: {features}\n' \
                  f'Target: {target}\n' \
                  f'Metas: {metas}'
        self.assertEqual(details, format_summary_details(data))

    def test_details(self):
        """Check if details part of the summary is formatted correctly"""
        # Each scenario runs in its own subTest, so that one failure does not
        # hide the results of the scenarios that follow it.
        with self.subTest('zoo'):
            data = Table('zoo')
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} categorical',
                target='categorical',
                metas='string (not shown)')

        with self.subTest('housing'):
            data = Table('housing')
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} numeric',
                target='numeric',
                metas='—')

        with self.subTest('heart_disease'):
            data = Table('heart_disease')
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} '
                         f'(7 categorical, 6 numeric)',
                target='categorical',
                metas='—')

        with self.subTest('several variables of one kind per role'):
            data = make_table(
                [continuous_full, continuous_missing],
                target=[rgb_full, rgb_missing],
                metas=[ints_full, ints_missing]
            )
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} numeric',
                target=f'{len(data.domain.class_vars)} categorical',
                metas=f'{len(data.domain.metas)} categorical')

        with self.subTest('mixed kinds'):
            data = make_table(
                [continuous_full, time_full, ints_full, rgb_missing],
                target=[rgb_full, continuous_missing],
                metas=[string_full, string_missing]
            )
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} '
                         f'(2 categorical, 1 numeric, 1 time)',
                target=f'{len(data.domain.class_vars)} '
                       f'(1 categorical, 1 numeric)',
                metas=f'{len(data.domain.metas)} string (not shown)')

        with self.subTest('time features, no metas'):
            data = make_table([time_full, time_missing],
                              target=[ints_missing], metas=None)
            self._assert_details(
                data,
                features=f'{len(data.domain.attributes)} time',
                target='categorical',
                metas='—')

        with self.subTest('features only'):
            data = make_table([rgb_full, ints_full], target=None, metas=None)
            self._assert_details(
                data,
                features=f'{len(data.domain.variables)} categorical',
                target='—',
                metas='—')

        with self.subTest('single feature'):
            data = make_table([rgb_full], target=None, metas=None)
            self._assert_details(
                data,
                features='categorical',
                target='—',
                metas='—',
                feature_noun='feature')

        with self.subTest('no data'):
            data = None
            self.assertEqual('', format_summary_details(data))