Skip to content

Commit

Permalink
Merge pull request #4367 from aturanjanin/state_summary
Browse files Browse the repository at this point in the history
String formatting for the input/output summary
  • Loading branch information
janezd authored Feb 7, 2020
2 parents 3c27153 + bef62df commit b8afe13
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 0 deletions.
66 changes: 66 additions & 0 deletions Orange/widgets/utils/state_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from Orange.data import StringVariable, DiscreteVariable, ContinuousVariable, \
TimeVariable


def format_variables_string(variables):
    """
    Describe one group of variables (features, targets or metas) for the
    input/output summary.

    Variables are grouped by their exact type. A single variable is reported
    by its type name only (``categorical``), several variables of one type
    as ``<count> <type>`` and a mixture of types as
    ``<total> (<count> <type>, ...)``. String variables are additionally
    marked with ``(not shown)``.

    :param variables: Features, targets or metas of the input dataset
    :return: A formatted string; an em dash if `variables` is empty
    """
    if not variables:
        return '—'

    type_labels = (
        ('categorical', DiscreteVariable),
        ('numeric', ContinuousVariable),
        ('time', TimeVariable),
        ('string', StringVariable),
    )

    summary = []
    for label, var_class in type_labels:
        # The exact type is compared on purpose: a `TimeVariable` is also a
        # `ContinuousVariable`, so `isinstance` would count time variables
        # as numeric as well.
        n_matching = sum(1 for var in variables if type(var) is var_class)  # pylint: disable=unidiomatic-typecheck
        if not n_matching:
            continue
        suffix = ' (not shown)' if issubclass(var_class, StringVariable) else ''
        summary.append((f'{label}{suffix}', n_matching))

    labels, totals = list(zip(*summary))
    if len(labels) > 1:
        parts = ', '.join(f'{n} {name}' for n, name in zip(totals, labels))
        return f'{sum(totals)} ({parts})'
    if totals[0] == 1:
        return labels[0]
    return f'{totals[0]} {labels[0]}'


def format_summary_details(data):
    """
    Form the entire descriptive part of the input/output summary.

    The result has one line with the number of instances and the total number
    of variables (attributes, class variables and metas), followed by one
    line each describing the features, the target and the metas.

    :param data: A dataset, or None when there is no data
    :type data: Orange.data.Table
    :return: A formatted string; an empty string if `data` is None
    """
    # Compare with None explicitly. An object that defines `__len__` is falsy
    # when empty (unless it also defines `__bool__`), so a plain truth test
    # could mistake a dataset with no instances for missing data and skip
    # the description of its domain.
    if data is None:
        return ''

    def _plural(number):
        # 's' unless the count is exactly one
        return 's' * (number != 1)

    domain = data.domain
    features = format_variables_string(domain.attributes)
    targets = format_variables_string(domain.class_vars)
    metas = format_variables_string(domain.metas)

    n_instances = len(data)
    n_features = len(domain.variables) + len(domain.metas)
    return (
        f'{n_instances} instance{_plural(n_instances)}, '
        f'{n_features} feature{_plural(n_features)}\n'
        f'Features: {features}\nTarget: {targets}\nMetas: {metas}'
    )
184 changes: 184 additions & 0 deletions Orange/widgets/utils/tests/test_state_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import unittest
import datetime
from collections import namedtuple

import numpy as np

from Orange.widgets.utils.state_summary import format_summary_details
from Orange.data import Table, Domain, StringVariable, ContinuousVariable, \
DiscreteVariable, TimeVariable

# Pairs a variable descriptor with the column of values stored for it
VarDataPair = namedtuple('VarDataPair', ('variable', 'data'))

# Continuous variable variations; the `*_missing` fixtures contain one NaN
continuous_full = VarDataPair(
    variable=ContinuousVariable('continuous_full'),
    data=np.array([0, 1, 2, 3, 4], dtype=float),
)
continuous_missing = VarDataPair(
    variable=ContinuousVariable('continuous_missing'),
    data=np.array([0, 1, 2, np.nan, 4], dtype=float),
)

# Unordered discrete variable variations
rgb_full = VarDataPair(
    variable=DiscreteVariable('rgb_full', values=['r', 'g', 'b']),
    data=np.array([0, 1, 1, 1, 2], dtype=float),
)
rgb_missing = VarDataPair(
    variable=DiscreteVariable('rgb_missing', values=['r', 'g', 'b']),
    data=np.array([0, 1, 1, np.nan, 2], dtype=float),
)

# Ordered discrete variable variations
ints_full = VarDataPair(
    variable=DiscreteVariable(
        'ints_full', values=['2', '3', '4'], ordered=True
    ),
    data=np.array([0, 1, 1, 1, 2], dtype=float),
)
ints_missing = VarDataPair(
    variable=DiscreteVariable(
        'ints_missing', values=['2', '3', '4'], ordered=True
    ),
    data=np.array([0, 1, 1, np.nan, 2], dtype=float),
)

def _to_timestamps(years):
    """Return the timestamp of January 1st for each year in `years`.

    NaN entries are passed through as NaN. The datetimes are naive, so
    `timestamp()` interprets them in the local time zone.
    """
    timestamps = []
    for year in years:
        if np.isnan(year):
            timestamps.append(np.nan)
        else:
            timestamps.append(datetime.datetime(year, 1, 1).timestamp())
    return timestamps

# Time variable variations; values are timestamps of January 1st of a year
time_full = VarDataPair(
    variable=TimeVariable('time_full'),
    data=np.array(
        _to_timestamps([2000, 2001, 2002, 2003, 2004]), dtype=float
    ),
)
time_missing = VarDataPair(
    variable=TimeVariable('time_missing'),
    data=np.array(
        _to_timestamps([2000, np.nan, 2001, 2003, 2004]), dtype=float
    ),
)

# String variable variations
string_full = VarDataPair(
    variable=StringVariable('string_full'),
    data=np.array(['a', 'b', 'c', 'd', 'e'], dtype=object),
)
string_missing = VarDataPair(
    variable=StringVariable('string_missing'),
    data=np.array(['a', 'b', 'c', StringVariable.Unknown, 'e'], dtype=object),
)


def make_table(attributes, target=None, metas=None):
    """Build an instance of a table given various variables.

    Each argument is an iterable of (variable, values) pairs. The values of
    one argument are stacked into a single array and transposed before being
    passed to the table.

    Parameters
    ----------
    attributes : Iterable[Tuple[Variable, np.array]]
    target : Optional[Iterable[Tuple[Variable, np.array]]]
    metas : Optional[Iterable[Tuple[Variable, np.array]]]

    Returns
    -------
    Table

    """
    def _split(pairs):
        # Separate the variables from their values and stack the values
        variables, columns = list(zip(*pairs))
        return variables, np.array(columns).T

    attribute_vars, attribute_vals = _split(attributes)

    # Omitted parts are handed on as None
    if target is None:
        target_vars, target_vals = None, None
    else:
        target_vars, target_vals = _split(target)

    if metas is None:
        meta_vars, meta_vals = None, None
    else:
        meta_vars, meta_vals = _split(metas)

    domain = Domain(attribute_vars, class_vars=target_vars, metas=meta_vars)
    return Table.from_numpy(
        domain, X=attribute_vals, Y=target_vals, metas=meta_vals,
    )


class TestUtils(unittest.TestCase):
    def test_details(self):
        """Check if details part of the summary is formatted correctly"""
        def check(data, features, target, metas, noun='features'):
            # Build the expected four-line summary and compare it with the
            # actual output. The first line always uses the plural
            # "instances"; `noun` selects "feature" or "features".
            n_features = len(data.domain.variables) + len(data.domain.metas)
            expected = '\n'.join((
                f'{len(data)} instances, {n_features} {noun}',
                f'Features: {features}',
                f'Target: {target}',
                f'Metas: {metas}',
            ))
            self.assertEqual(expected, format_summary_details(data))

        # Bundled datasets
        zoo = Table('zoo')
        check(zoo,
              features=f'{len(zoo.domain.attributes)} categorical',
              target='categorical',
              metas='string (not shown)')

        housing = Table('housing')
        check(housing,
              features=f'{len(housing.domain.attributes)} numeric',
              target='numeric',
              metas='—')

        heart = Table('heart_disease')
        check(heart,
              features=f'{len(heart.domain.attributes)} '
                       f'(7 categorical, 6 numeric)',
              target='categorical',
              metas='—')

        # Several variables of a single type in each part of the domain
        uniform = make_table(
            [continuous_full, continuous_missing],
            target=[rgb_full, rgb_missing], metas=[ints_full, ints_missing]
        )
        check(uniform,
              features=f'{len(uniform.domain.attributes)} numeric',
              target=f'{len(uniform.domain.class_vars)} categorical',
              metas=f'{len(uniform.domain.metas)} categorical')

        # Mixed variable types among the features and the targets
        mixed = make_table(
            [continuous_full, time_full, ints_full, rgb_missing],
            target=[rgb_full, continuous_missing],
            metas=[string_full, string_missing]
        )
        check(mixed,
              features=f'{len(mixed.domain.attributes)} '
                       f'(2 categorical, 1 numeric, 1 time)',
              target=f'{len(mixed.domain.class_vars)} '
                     f'(1 categorical, 1 numeric)',
              metas=f'{len(mixed.domain.metas)} string (not shown)')

        # Time features with a single target and no metas
        timed = make_table([time_full, time_missing], target=[ints_missing],
                           metas=None)
        check(timed,
              features=f'{len(timed.domain.attributes)} time',
              target='categorical',
              metas='—')

        # Features only
        plain = make_table([rgb_full, ints_full], target=None, metas=None)
        check(plain,
              features=f'{len(plain.domain.variables)} categorical',
              target='—',
              metas='—')

        # A single feature: singular noun and no count before the type
        single = make_table([rgb_full], target=None, metas=None)
        check(single,
              features='categorical',
              target='—',
              metas='—',
              noun='feature')

        # No data at all
        self.assertEqual('', format_summary_details(None))

0 comments on commit b8afe13

Please sign in to comment.