From c441019ec0bc2c8cb4d10e99bd89c090ce8fc0ae Mon Sep 17 00:00:00 2001 From: Vesna Tanko Date: Tue, 28 May 2019 13:22:42 +0200 Subject: [PATCH] Pivot: New widget --- Orange/widgets/data/icons/Pivot.svg | 26 + Orange/widgets/data/owpivot.py | 861 ++++++++++++++++++++++ Orange/widgets/data/tests/test_owpivot.py | 612 +++++++++++++++ 3 files changed, 1499 insertions(+) create mode 100644 Orange/widgets/data/icons/Pivot.svg create mode 100644 Orange/widgets/data/owpivot.py create mode 100644 Orange/widgets/data/tests/test_owpivot.py diff --git a/Orange/widgets/data/icons/Pivot.svg b/Orange/widgets/data/icons/Pivot.svg new file mode 100644 index 00000000000..7429738f5e3 --- /dev/null +++ b/Orange/widgets/data/icons/Pivot.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + diff --git a/Orange/widgets/data/owpivot.py b/Orange/widgets/data/owpivot.py new file mode 100644 index 00000000000..4f7c378070f --- /dev/null +++ b/Orange/widgets/data/owpivot.py @@ -0,0 +1,861 @@ +# pylint: disable=missing-docstring +from typing import Iterable, Set +from collections import defaultdict +from itertools import product + +import numpy as np +from scipy import sparse as sp + +from AnyQt.QtCore import (Qt, QSize, QItemSelection, QItemSelectionModel, + pyqtSignal) +from AnyQt.QtGui import QStandardItem, QColor, QStandardItemModel +from AnyQt.QtWidgets import (QTableView, QSizePolicy, QHeaderView, + QStyledItemDelegate, QCheckBox, QFrame) + +from Orange.data import (Table, DiscreteVariable, Variable, Domain, + ContinuousVariable) +from Orange.data.filter import FilterContinuous, FilterDiscrete, Values +from Orange.statistics.util import (nanmin, nanmax, nanunique, nansum, nanvar, + nanmean, nanmedian, nanmode, bincount) +from Orange.util import Enum +from Orange.widgets import gui +from Orange.widgets.settings import (Setting, ContextSetting, + DomainContextHandler) +from Orange.widgets.utils.sql import check_sql_input +from Orange.widgets.utils.itemmodels import DomainModel +from 
Orange.widgets.utils.widgetpreview import WidgetPreview +from Orange.widgets.widget import OWWidget, Input, Output, Msg + + +BorderRole = next(gui.OrangeUserRole) +BorderColorRole = next(gui.OrangeUserRole) + + +class AggregationFunctionsEnum(Enum): + (Count, Count_defined, Sum, Mean, Min, Max, + Mode, Median, Var, Majority) = range(10) + + def __init__(self, *_, **__): + super().__init__() + self.func = None + + @property + def value(self): + return self._value_ + + def __call__(self, *args): + return self.func(args) # pylint: disable=not-callable + + def __str__(self): + return self._name_.replace("_", " ") + + def __gt__(self, other): + return self._value_ > other.value + + +class Pivot: + Functions = AggregationFunctionsEnum + (Count, Count_defined, Sum, Mean, Min, Max, + Mode, Median, Var, Majority) = Functions + + AutonomousFunctions = (Count,) + AnyVarFunctions = (Count_defined,) + ContVarFunctions = (Sum, Mean, Min, Max, Mode, Median, Var) + DiscVarFunctions = (Majority,) + + class Tables: + table = None # type: Table + total_h = None # type: Table + total_v = None # type: Table + total = None # type: Table + + def __call__(self): + return self.table, self.total_h, self.total_v, self.total + + def __init__(self, table: Table, agg_funs: Iterable[Functions], + row_var: Variable, col_var: Variable = None, + val_var: Variable = None): + self._group_tables = self.Tables() + self._pivot_tables = self.Tables() + self._table = table + self._row_var = row_var + self._col_var = col_var if col_var else row_var + + if not table: + return + if not self._row_var.is_primitive(): + raise TypeError("Row variable should be DiscreteVariable" + " or ContinuousVariable") + if self._col_var and not self._col_var.is_discrete: + raise TypeError("Column variable should be DiscreteVariable") + + self._row_var_col = table.get_column_view(row_var)[0].astype(np.float) + self._col_var_col = table.get_column_view(self._col_var)[0].astype(np.float) + self._row_var_groups = 
nanunique(self._row_var_col) + self._col_var_groups = nanunique(self._col_var_col) + + self._total_var = DiscreteVariable("Total", values=["total"]) + self._current_agg_functions = sorted(agg_funs) + self._indepen_agg_done = {} # type: Dict[Functions, int] + self._depen_agg_done = {} # type: Dict[Functions, Dict[Variable, int]] + + self._initialize(agg_funs, val_var) + + @property + def group_table(self) -> Table: + table = self._group_tables.table + if not table or len(table) == 0: + return None + indices = [0, 1] if not self.single_var_grouping else [0] + for f in self._current_agg_functions: + if f in self._indepen_agg_done: + indices.append(self._indepen_agg_done[f]) + for v in self._table.domain.variables + self._table.domain.metas: + for f in self._current_agg_functions: + if f in self._depen_agg_done and v in self._depen_agg_done[f]: + indices.append(self._depen_agg_done[f][v]) + return table[:, indices] + + @property + def pivot_table(self) -> Table: + return self._pivot_tables.table + + @property + def pivot_total_h(self) -> Table: + return self._pivot_tables.total_h + + @property + def pivot_total_v(self) -> Table: + return self._pivot_tables.total_v + + @property + def pivot_total(self) -> Table: + return self._pivot_tables.total + + @property + def pivot_tables(self) -> Table: + return self._pivot_tables() + + @property + def single_var_grouping(self) -> bool: + return self._row_var is self._col_var + + def update_group_table(self, agg_funs: Iterable[Functions], + val_var: Variable = None): + if not self._group_tables: + return + self._current_agg_functions = sorted(agg_funs) + agg_funs = set(self._indepen_agg_done.keys()) | \ + set(self._depen_agg_done.keys()) | set(agg_funs) + self._initialize(sorted(agg_funs), val_var) + + def _initialize(self, agg_funs, val_var): + var_indep_funs, var_dep_funs = self.__group_aggregations(agg_funs) + self._create_group_tables(var_indep_funs, var_dep_funs) + self.__reference_aggregations(var_indep_funs, var_dep_funs) 
+ self._create_pivot_tables(val_var) + + def __group_aggregations(self, agg_funs): + auto_funcs = self.AutonomousFunctions + var_indep_funs = [fun for fun in agg_funs if fun in auto_funcs] + var_dep_funs = [] + prod = product(self._table.domain.variables + self._table.domain.metas, + [fun for fun in agg_funs if fun not in auto_funcs]) + for var, fun in prod: + if self.__include_aggregation(fun, var): + var_dep_funs.append((var, fun)) + return var_indep_funs, var_dep_funs + + def __include_aggregation(self, fun, var): + return fun in self.ContVarFunctions and var.is_continuous or \ + fun in self.DiscVarFunctions and var.is_discrete or \ + fun in self.AnyVarFunctions + + def __reference_aggregations(self, var_indep_funs, var_dep_funs): + self._indepen_agg_done = {} + self._depen_agg_done = defaultdict(dict) + i = 1 - int(bool(self.single_var_grouping)) + for i, fun in enumerate(var_indep_funs, i + 1): + self._indepen_agg_done[fun] = i + for j, (var, fun) in enumerate(var_dep_funs, i + 1): + self._depen_agg_done[fun].update({var: j}) + + def _create_group_tables(self, var_indep_funs, var_dep_funs): + attrs = [ContinuousVariable(f"({str(fun).lower()})") + for fun in var_indep_funs] + for var, fun in var_dep_funs: + name = f"{var.name} ({str(fun).lower()})" + if fun in self.DiscVarFunctions: + attrs.append(DiscreteVariable(name, var.values)) + else: + attrs.append(ContinuousVariable(name)) + args = (var_indep_funs, var_dep_funs, attrs) + for t, var in (("table", None), ("total_h", self._col_var), + ("total_v", self._row_var), ("total", self._total_var)): + setattr(self._group_tables, t, self.__get_group_table(var, *args)) + + def __get_group_table(self, var, var_indep_funs, var_dep_funs, attrs): + if var is self._total_var: + group_tab = self._group_tables.total + offset = int(bool(not self.single_var_grouping)) + leading_vars = [self._total_var] + combs = np.array([[0]]) + sub_table_getter = lambda x: \ + self._table[np.where((~np.isnan(self._row_var_col)) & + 
(~np.isnan(self._col_var_col)))[0]] + elif var is self._row_var or self.single_var_grouping: + group_tab = self._group_tables.total_v + offset = int(bool(not self.single_var_grouping)) + leading_vars = [self._row_var] + combs = self._row_var_groups[:, None] + sub_table_getter = lambda x: \ + self._table[np.where((~np.isnan(self._col_var_col)) & + (self._row_var_col == x[0]))[0]] + elif var is self._col_var: + group_tab = self._group_tables.total_h + offset = int(bool(not self.single_var_grouping)) + leading_vars = [self._col_var] + combs = self._col_var_groups[:, None] + sub_table_getter = lambda x: \ + self._table[np.where((~np.isnan(self._row_var_col)) & + (self._col_var_col == x[0]))[0]] + else: + group_tab = self._group_tables.table + offset = 0 + leading_vars = [self._row_var, self._col_var] + combs = np.array(list(product(self._row_var_groups, + self._col_var_groups))) + sub_table_getter = lambda x: \ + self._table[np.where((self._row_var_col == x[0]) + & (self._col_var_col == x[1]))[0]] + + if not combs.shape[0]: + return None + + n = len(var_indep_funs) + len(var_dep_funs) + X = np.zeros((len(combs), n), dtype=float) + for i, comb in enumerate(combs): + sub_table = sub_table_getter(comb) + j = -1 + for j, fun in enumerate(var_indep_funs): + if fun in self._indepen_agg_done: + # TODO - optimize - after this line is executed, + # the whole column is already set + X[:, j] = group_tab.X[:, self._indepen_agg_done[fun] - offset] + else: + X[i, j] = fun(sub_table) + for k, (v, fun) in enumerate(var_dep_funs, j + 1): + if fun in self._depen_agg_done: + X[:, k] = group_tab.X[:, self._depen_agg_done[fun][v] - offset] + else: + X[i, k] = fun(sub_table.get_column_view(v)[0]) + return Table(Domain(leading_vars + attrs), np.hstack((combs, X))) + + def update_pivot_table(self, val_var: Variable): + self._create_pivot_tables(val_var) + + def _create_pivot_tables(self, val_var): + if not self._group_tables.table: + self._pivot_tables = self.Tables() + return + + agg_funs = 
[fun for fun in self._current_agg_functions + if fun in self.AutonomousFunctions + or val_var and self.__include_aggregation(fun, val_var)] + X, X_h, X_v, X_t = self.__get_pivot_tab_x(val_var, agg_funs) + dom, dom_h, dom_v, dom_t = self.__get_pivot_tab_domain( + val_var, X, X_h, X_v, X_t, agg_funs) + for t, d, x in (("table", dom, X), ("total_h", dom_h, X_h), + ("total_v", dom_v, X_v), ("total", dom_t, X_t)): + setattr(self._pivot_tables, t, Table(d, x)) + + # pylint: disable=invalid-name + def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs): + def map_values(index, _X): + values = np.unique(_X[:, index]) + values = np.delete(values, np.where(values == "nan")[0]) + for j, value in enumerate(values): + _X[:, index][_X[:, index] == value] = j + return values + + vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)] + if not val_var or val_var.is_continuous: + cv = ContinuousVariable + attrs = [[cv(f"{v}", 1) for v in vals]] * 2 + attrs.extend([[cv("Total", 1)]] * 2) + else: + attrs = [] + for x in (X, X_h): + attrs.append([DiscreteVariable(f"{v}", map_values(i, x)) + for i, v in enumerate(vals, 2)]) + for x in (X_v, X_t): + attrs.append([DiscreteVariable("Total", map_values(0, x))]) + row_var_h = DiscreteVariable(self._row_var.name, values=["Total"]) + aggr_attr = DiscreteVariable("Aggregate", [str(f) for f in agg_funs]) + return (Domain([self._row_var, aggr_attr] + attrs[0]), + Domain([row_var_h, aggr_attr] + attrs[1]), + Domain(attrs[2]), Domain(attrs[3])) + + def __get_pivot_tab_x(self, val_var, agg_funs): + gt = self._group_tables + n_fun = len(agg_funs) + n_rows, n_cols = len(self._row_var_groups), len(self._col_var_groups) + kwargs = {"fill_value": np.nan, "dtype": float} \ + if not val_var or val_var.is_continuous \ + else {"fill_value": "", "dtype": object} + X = np.full((n_rows * n_fun, 2 + n_cols), **kwargs) + X_h = np.full((n_fun, 2 + n_cols), **kwargs) + X_v = np.full((n_rows * n_fun, 1), **kwargs) + X_t = 
np.full((n_fun, 1), **kwargs) + for i, fun in enumerate(agg_funs): + args = (val_var, fun) + X[i::n_fun, 2:] = self.__rows_for_function(n_rows, n_cols, *args) + X[i::n_fun, :2] = np.array([[row_val, agg_funs.index(fun)] for + row_val in self._row_var_groups]) + X_h[i, :2] = 0, agg_funs.index(fun) + X_h[i, 2:] = self.__total_for_function(gt.total_h, *args) + X_v[i::n_fun, 0] = self.__total_for_function(gt.total_v, *args) + X_t[i] = self.__total_for_function(gt.total, *args) + return X, X_h, X_v, X_t + + def __total_for_function(self, group_tab, val_var, fun): + ref = self._indepen_agg_done.get(fun, None) \ + or self._depen_agg_done[fun][val_var] + ref -= int(bool(not self.single_var_grouping)) + return self.__check_continuous(val_var, group_tab.X[:, ref], fun) + + def __rows_for_function(self, n_rows, n_cols, val_var, fun): + ref = self._indepen_agg_done.get(fun, None) \ + or self._depen_agg_done[fun][val_var] + column = self._group_tables.table.X[:, ref] + if self.single_var_grouping: + rows = np.full((n_rows, n_cols), fun(np.array([]), ), dtype=float) + rows[np.diag_indices_from(rows)] = column + else: + rows = column.reshape(n_rows, n_cols) + return self.__check_continuous(val_var, rows, fun) + + def __check_continuous(self, val_var, column, fun): + if val_var and not val_var.is_continuous: + column = column.astype(str) + if fun in self.DiscVarFunctions: + for j, val in enumerate(val_var.values): + column[column == str(float(j))] = val + return column + + @staticmethod + def count_defined(x): + if x.shape[0] == 0: + return 0 + if x.size and np.issubdtype(x.dtype, np.number) and not sp.issparse(x): + nans = np.isnan(x).sum(axis=0) + elif sp.issparse(x) and x.size: + nans = np.bincount(x.nonzero()[1], minlength=x.shape[1]) + x = x.tocsc() + else: + x_str = x.astype(str) + nans = ((x_str == "nan") | (x_str == "")).sum(axis=0) \ + if x.size else np.zeros(x.shape[1]) + return x.shape[0] - nans + + @staticmethod + def stat(x, f): + return f(x.astype(np.float), axis=0) 
if x.shape[0] > 0 else np.nan + + @staticmethod + def mode(x): + return Pivot.stat(x, nanmode).mode if x.shape[0] > 0 else np.nan + + @staticmethod + def majority(x): + if x.shape[0] == 0: + return np.nan + counts = bincount(x)[0] + return np.argmax(counts) if counts.shape[0] else np.nan + + Count.func = lambda x: len(x[0]) + Count_defined.func = lambda x: Pivot.count_defined(x[0]) + Sum.func = lambda x: nansum(x[0], axis=0) if x[0].shape[0] > 0 else 0 + Mean.func = lambda x: Pivot.stat(x[0], nanmean) + Min.func = lambda x: Pivot.stat(x[0], nanmin) + Max.func = lambda x: Pivot.stat(x[0], nanmax) + Median.func = lambda x: Pivot.stat(x[0], nanmedian) + Mode.func = lambda x: Pivot.mode(x[0]) + Var.func = lambda x: Pivot.stat(x[0], nanvar) + Majority.func = lambda x: Pivot.majority(x[0]) + + +class BorderedItemDelegate(QStyledItemDelegate): + def paint(self, painter, option, index): + """Overloads `paint` to draw borders""" + QStyledItemDelegate.paint(self, painter, option, index) + if index.data(BorderRole): + painter.save() + painter.setPen(index.data(BorderColorRole)) + rect = option.rect + painter.drawLine(rect.topLeft(), rect.topRight()) + painter.restore() + + +class PivotTableView(QTableView): + selection_changed = pyqtSignal() + + TOTAL_STRING = "Total" + + def __init__(self): + super().__init__(editTriggers=QTableView.NoEditTriggers) + self._n_classesv = None # number of row_feature values + self._n_classesh = None # number of col_feature values + self._n_agg_func = None # number of aggregation functions + self._n_leading_rows = None # number of leading rows + self._n_leading_cols = None # number of leading columns + + self.table_model = QStandardItemModel(self) + self.setModel(self.table_model) + self.horizontalHeader().hide() + self.verticalHeader().hide() + self.horizontalHeader().setMinimumSectionSize(60) + self.setShowGrid(False) + self.setSizePolicy(QSizePolicy.MinimumExpanding, + QSizePolicy.MinimumExpanding) + 
self.setItemDelegate(BorderedItemDelegate()) + self.pressed.connect(self.__cell_clicked) + self.clicked.connect(self.__cell_clicked) + self.entered.connect(self.__cell_entered) + self.__clicked_cell = None + + @property + def add_agg_column(self) -> bool: + return self._n_agg_func > 1 + + def __cell_entered(self, model_index): + if self.__clicked_cell is None: + return + index = self.table_model.index + selection = None + i_end, j_end = model_index.row(), model_index.column() + i_start, j_start = self.__clicked_cell + i_start, i_end = sorted([i_start, i_end]) + j_start, j_end = sorted([j_start, j_end]) + if i_start >= self._n_leading_rows and j_start >= self._n_leading_cols: + i_start = (i_start - self._n_leading_rows) // self._n_agg_func * \ + self._n_agg_func + self._n_leading_rows + i_end = (i_end - self._n_leading_rows) // self._n_agg_func * \ + self._n_agg_func + self._n_leading_rows + self._n_agg_func - 1 + start, end = index(i_start, j_start), index(i_end, j_end) + selection = QItemSelection(start, end) + if selection is not None: + self.selectionModel().select( + selection, QItemSelectionModel.ClearAndSelect) + self.selection_changed.emit() + + def __cell_clicked(self, model_index): + i, j = model_index.row(), model_index.column() + self.__clicked_cell = (i, j) + m, n = self.table_model.rowCount(), self.table_model.columnCount() + index = self.table_model.index + selection = None + if i > m - self._n_agg_func - 1 and j == n - 1: + start_index = index(self._n_leading_rows, self._n_leading_cols) + selection = QItemSelection(start_index, index(m - 1, n - 1)) + elif i == self._n_leading_rows - 1 or i > m - self._n_agg_func - 1: + start_index = index(self._n_leading_rows, j) + selection = QItemSelection(start_index, index(m - 1, j)) + elif j in (self._n_leading_cols - 1, n - 1, 1): + i_start = (i - self._n_leading_rows) // self._n_agg_func * \ + self._n_agg_func + self._n_leading_rows + i_end = i_start + self._n_agg_func - 1 + start_index = index(i_start, 
self._n_leading_cols) + selection = QItemSelection(start_index, index(i_end, n - 1)) + elif i >= self._n_leading_rows and j >= self._n_leading_cols: + i_start = (i - self._n_leading_rows) // self._n_agg_func * \ + self._n_agg_func + self._n_leading_rows + i_end = i_start + self._n_agg_func - 1 + selection = QItemSelection(index(i_start, j), index(i_end, j)) + + if selection is not None: + self.selectionModel().select( + selection, QItemSelectionModel.ClearAndSelect) + + def mouseReleaseEvent(self, e): + super().mouseReleaseEvent(e) + self.selection_changed.emit() + + def update_table(self, titleh: str, titlev: str, table: Table, + table_total_h: Table, table_total_v: Table, + table_total: Table): + self.clear() + if not table: + return + + self._initialize(table, table_total_h) + self._set_headers(titleh, titlev, table) + self._set_values(table[:, 2:]) + self._set_totals(table_total_h[:, 2:], table_total_v, table_total) + self._draw_lines() + self._resize(table) + + def _initialize(self, table, table_total_h): + self._n_classesv = int(len(table) / len(table_total_h)) + self._n_classesh = table.X.shape[1] - 2 + self._n_agg_func = len(table_total_h) + self._n_leading_rows = 2 + self._n_leading_cols = 2 + int(len(table_total_h) > 1) + + def _set_headers(self, titleh, titlev, table): + self.__set_horizontal_title(titleh) + self.__set_vertical_title(titlev) + self.__set_flags_title() + self.__set_horizontal_headers(table) + self.__set_vertical_headers(table) + + def __set_horizontal_title(self, titleh): + item = QStandardItem() + item.setData(titleh, Qt.DisplayRole) + item.setTextAlignment(Qt.AlignCenter) + self.table_model.setItem(0, self._n_leading_cols, item) + self.setSpan(0, self._n_leading_cols, 1, self._n_classesh + 3) + + def __set_vertical_title(self, titlev): + item = QStandardItem() + item.setData(titlev, Qt.DisplayRole) + item.setTextAlignment(Qt.AlignHCenter | Qt.AlignBottom) + self.setItemDelegateForColumn(0, gui.VerticalItemDelegate(extend=True)) + 
self.table_model.setItem(self._n_leading_rows, 0, item) + row_span = self._n_classesv * self._n_agg_func + 1 + self.setSpan(self._n_leading_rows, 0, row_span, 1) + + def __set_flags_title(self): + item = self.table_model.item(0, self._n_leading_cols) + item.setFlags(Qt.NoItemFlags) + item = self.table_model.item(self._n_leading_rows, 0) + item.setFlags(Qt.NoItemFlags) + for i, j in product(range(self._n_leading_rows), + range(self._n_leading_cols)): + item = QStandardItem() + item.setFlags(Qt.NoItemFlags) + self.table_model.setItem(i, j, item) + + def __set_horizontal_headers(self, table): + labels = [a.name for a in table.domain[1:]] + [self.TOTAL_STRING] + if not self.add_agg_column: + labels[0] = str(table[0, 1]) + for i, label in enumerate(labels, self._n_leading_cols - 1): + self.table_model.setItem(1, i, self._create_header_item(label)) + + def __set_vertical_headers(self, table): + labels = [(str(row[0]), str(row[1])) for row in table] + i = self._n_leading_rows - 1 + for i, (l1, l2) in enumerate(labels, self._n_leading_rows): + l1 = "" if (i - self._n_leading_rows) % self._n_agg_func else l1 + self.table_model.setItem(i, 1, self._create_header_item(l1)) + if self.add_agg_column: + self.table_model.setItem(i, 2, self._create_header_item(l2)) + + if self.add_agg_column: + labels = [str(row[1]) for row in table[:self._n_agg_func]] + start = self._n_leading_rows + self._n_agg_func * self._n_classesv + for j, l2 in enumerate(labels, i + 1): + l1 = self.TOTAL_STRING if j == start else "" + self.table_model.setItem(j, 1, self._create_header_item(l1)) + self.table_model.setItem(j, 2, self._create_header_item(l2)) + else: + item = self._create_header_item(self.TOTAL_STRING) + self.table_model.setItem(i + 1, 1, item) + + def _set_values(self, table): + for i, j in product(range(len(table)), range(len(table[0]))): + item = self._create_value_item(str(table[i, j])) + self.table_model.setItem(i + self._n_leading_rows, + j + self._n_leading_cols, item) + + def 
_set_totals(self, table_total_h, table_total_v, table_total): + def set_total_item(table, get_row, get_col): + for i, j in product(range(len(table)), range(len(table[0]))): + item = self._create_header_item(str(table[i, j])) + self.table_model.setItem(get_row(i), get_col(j), item) + + last_row = self._n_leading_rows + self._n_classesv * self._n_agg_func + last_col = self._n_leading_cols + self._n_classesh + set_total_item(table_total_v, lambda x: x + self._n_leading_rows, + lambda x: last_col) + set_total_item(table_total_h, lambda x: x + last_row, + lambda x: x + self._n_leading_cols) + set_total_item(table_total, lambda x: x + last_row, lambda x: last_col) + + def _create_header_item(self, text): + bold_font = self.table_model.invisibleRootItem().font() + bold_font.setBold(True) + item = QStandardItem() + item.setData(text, Qt.DisplayRole) + item.setFont(bold_font) + item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter) + item.setFlags(Qt.ItemIsEnabled) + return item + + @staticmethod + def _create_value_item(text): + item = QStandardItem() + item.setData(text, Qt.DisplayRole) + item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter) + item.setFlags(Qt.ItemIsEnabled | Qt.ItemIsSelectable) + return item + + def _draw_lines(self): + end_col = self._n_leading_cols + self._n_classesh + 1 + total_row = self._n_leading_rows + self._n_classesv * self._n_agg_func + indices = [(total_row, j) for j in range(1, end_col)] + for i in range(self._n_classesv): + inner_row = self._n_agg_func * i + self._n_leading_rows + inner_indices = [(inner_row, j) for j in range(1, end_col)] + indices = indices + inner_indices + if not self.add_agg_column: + break + for i, j in indices: + item = self.table_model.item(i, j) + item.setData("t", BorderRole) + item.setData(QColor(160, 160, 160), BorderColorRole) + + def _resize(self, table): + labels = [a.name for a in table.domain[1:]] + [self.TOTAL_STRING] + if len(' '.join(labels)) < 120: + self.horizontalHeader().setSectionResizeMode( + 
QHeaderView.ResizeToContents) + else: + self.horizontalHeader().setDefaultSectionSize(60) + + def get_selection(self) -> Set: + m, n = self._n_leading_rows, self._n_leading_cols + return {(ind.row() - m, ind.column() - n) + for ind in self.selectedIndexes()} + + def set_selection(self, indexes: Set): + selection = QItemSelection() + index = self.model().index + for row, col in indexes: + sel = index(row + self._n_leading_rows, col + self._n_leading_cols) + selection.select(sel, sel) + self.selectionModel().select( + selection, QItemSelectionModel.ClearAndSelect) + + def clear(self): + self.table_model.clear() + + +class OWPivot(OWWidget): + name = "Pivot Table" + description = "Reshape data table based on column values." + icon = "icons/Pivot.svg" + priority = 1000 + keywords = ["pivot", "group", "aggregate"] + + class Inputs: + data = Input("Data", Table, default=True) + + class Outputs: + pivot_table = Output("Pivot Table", Table, default=True) + filtered_data = Output("Filtered Data", Table) + grouped_data = Output("Grouped Data", Table) + + class Warning(OWWidget.Warning): + # TODO - inconsistent for different variable types + no_col_feature = Msg("Column feature should be selected.") + cannot_aggregate = Msg("Some aggregations ({}) cannot be performed.") + + settingsHandler = DomainContextHandler() + row_feature = ContextSetting(None) + col_feature = ContextSetting(None) + val_feature = ContextSetting(None) + sel_agg_functions = Setting(set([Pivot.Count])) + selection = ContextSetting(set()) + auto_commit = Setting(True) + + AGGREGATIONS = (Pivot.Count, + Pivot.Count_defined, + None, + Pivot.Sum, + Pivot.Mean, + Pivot.Mode, + Pivot.Min, + Pivot.Max, + Pivot.Median, + Pivot.Var, + None, + Pivot.Majority) + + def __init__(self): + super().__init__() + self.data = None # type: Table + self.pivot = None # type: Pivot + self._add_control_area_controls() + self._add_main_area_controls() + + def _add_control_area_controls(self): + box = gui.vBox(self.controlArea, 
"Rows") + gui.comboBox(box, self, "row_feature", contentsLength=12, + model=DomainModel(valid_types=DomainModel.PRIMITIVE), + callback=self.__feature_changed) + box = gui.vBox(self.controlArea, "Columns") + gui.comboBox(box, self, "col_feature", contentsLength=12, + model=DomainModel(placeholder="(Same as rows)", + valid_types=DiscreteVariable), + callback=self.__feature_changed) + box = gui.vBox(self.controlArea, "Values") + gui.comboBox(box, self, "val_feature", contentsLength=12, + model=DomainModel(placeholder="(None)"), + orientation=Qt.Horizontal, + callback=self.__val_feature_changed) + self.__add_aggregation_controls() + gui.rubber(self.controlArea) + gui.auto_commit(self.controlArea, self, "auto_commit", "&Apply") + + def __add_aggregation_controls(self): + box = gui.vBox(self.controlArea, "Aggregations") + for agg in self.AGGREGATIONS: + if agg is None: + gui.separator(box, height=1) + line = QFrame() + line.setFrameShape(QFrame.HLine) + line.setLineWidth(1) + line.setFrameShadow(QFrame.Sunken) + box.layout().addWidget(line) + continue + check_box = QCheckBox(str(agg), box) + check_box.setChecked(agg in self.sel_agg_functions) + check_box.clicked.connect(lambda *args, a=agg: + self.__aggregation_cb_clicked(a, args[0])) + box.layout().addWidget(check_box) + + def _add_main_area_controls(self): + self.table_view = PivotTableView() + self.table_view.selection_changed.connect(self.__invalidate_filtered) + self.mainArea.layout().addWidget(self.table_view) + + @property + def no_col_feature(self): + return self.col_feature is None and self.row_feature is not None \ + and self.row_feature.is_continuous + + @property + def skipped_aggs(self): + def add(fun): + data, var = self.data, self.val_feature + return data and not var and fun not in Pivot.AutonomousFunctions \ + or var and var.is_discrete and fun in Pivot.ContVarFunctions \ + or var and var.is_continuous and fun in Pivot.DiscVarFunctions + skipped = [str(fun) for fun in self.sel_agg_functions if add(fun)] 
+ return ", ".join(sorted(skipped)) + + def __feature_changed(self): + self.selection = set() + self.pivot = None + self.commit() + + def __val_feature_changed(self): + self.selection = set() + if self.no_col_feature: + return + self.pivot.update_pivot_table(self.val_feature) + self.commit() + + def __aggregation_cb_clicked(self, agg_fun: Pivot.Functions, checked: bool): + self.selection = set() + if checked: + self.sel_agg_functions.add(agg_fun) + else: + self.sel_agg_functions.remove(agg_fun) + if self.no_col_feature or not self.pivot or not self.data: + return + self.pivot.update_group_table(self.sel_agg_functions, self.val_feature) + self.commit() + + def __invalidate_filtered(self): + self.selection = self.table_view.get_selection() + self.commit() + + @Inputs.data + @check_sql_input + def set_data(self, data): + self.closeContext() + self.data = data + self.pivot = None + self.check_data() + self.init_attr_values() + self.openContext(self.data) + self.unconditional_commit() + + def check_data(self): + self.clear_messages() + if not self.data: + self.table_view.clear() + + def init_attr_values(self): + domain = self.data.domain if self.data and len(self.data) else None + for attr in ("row_feature", "col_feature", "val_feature"): + getattr(self.controls, attr).model().set_domain(domain) + setattr(self, attr, None) + model = self.controls.row_feature.model() + if model: + self.row_feature = model[0] + if domain is not None and len(domain.variables): + self.val_feature = self.data.domain.variables[0] + + def commit(self): + if self.pivot is None: + self.Warning.no_col_feature.clear() + if self.no_col_feature: + self.Warning.no_col_feature() + return + self.pivot = Pivot(self.data, self.sel_agg_functions, + self.row_feature, + self.col_feature, self.val_feature) + self.Warning.cannot_aggregate.clear() + if self.skipped_aggs: + self.Warning.cannot_aggregate(self.skipped_aggs) + self._update_graph() + self.Outputs.grouped_data.send(self.pivot.group_table) + 
self.Outputs.pivot_table.send(self.pivot.pivot_table) + self.Outputs.filtered_data.send(self.get_filtered_data()) + + def _update_graph(self): + self.table_view.clear() + if self.pivot.pivot_table: + col_feature = self.col_feature or self.row_feature + self.table_view.update_table(col_feature.name, + self.row_feature.name, + *self.pivot.pivot_tables) + self.table_view.set_selection(self.selection) + + def get_filtered_data(self): + if not self.data or not self.selection or not self.pivot.pivot_table: + return None + + cond = [] + for i, j in self.selection: + f = [] + for at, val in [(self.row_feature, self.pivot.pivot_table.X[i, 0]), + (self.col_feature, j)]: + if isinstance(at, DiscreteVariable): + f.append(FilterDiscrete(at, [val])) + elif isinstance(at, ContinuousVariable): + f.append(FilterContinuous(at, FilterContinuous.Equal, val)) + cond.append(Values(f)) + return Values([f for f in cond], conjunction=False)(self.data) + + def sizeHint(self): + return QSize(640, 525) + + def send_report(self): + self.report_items(( + ("Row feature", self.row_feature), + ("Column feature", self.col_feature), + ("Value feature", self.val_feature))) + if self.data and self.val_feature is not None: + self.report_table("", self.table_view) + if not self.data: + self.report_items((("Group by", self.row_feature),)) + self.report_table(self.table_view) + + +if __name__ == "__main__": + WidgetPreview(OWPivot).run(set_data=Table("heart_disease")) diff --git a/Orange/widgets/data/tests/test_owpivot.py b/Orange/widgets/data/tests/test_owpivot.py new file mode 100644 index 00000000000..265c0fbe524 --- /dev/null +++ b/Orange/widgets/data/tests/test_owpivot.py @@ -0,0 +1,612 @@ +# Test methods with long descriptive names can omit docstrings +# pylint: disable=missing-docstring +import unittest +from unittest.mock import patch +from pickle import loads, dumps + +import numpy as np + +from AnyQt.QtCore import Qt, QPoint +from AnyQt.QtTest import QTest +from AnyQt.QtWidgets import QCheckBox 
from Orange.data import (Table, Domain, ContinuousVariable as Cv,
                         StringVariable as sv, DiscreteVariable as Dv)
from Orange.widgets.data.owpivot import (OWPivot, Pivot,
                                         AggregationFunctionsEnum)
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.tests.utils import simulate


class TestOWPivot(WidgetTest):
    """Widget-level tests that drive OWPivot through its controls."""

    def setUp(self):
        self.widget = self.create_widget(OWPivot)
        # The aggregation checkboxes are looked up positionally: they are
        # the QCheckBox children of the control area's child at index 7.
        self.agg_checkboxes = [checkbox for checkbox in
                               self.widget.controlArea.children()[7].children()
                               if isinstance(checkbox, QCheckBox)]
        self.assertGreater(len(self.agg_checkboxes), 0)
        self.iris = Table("iris")
        self.heart_disease = Table("heart_disease")
        self.zoo = Table("zoo")

    def test_comboboxes(self):
        self.send_signal(self.widget.Inputs.data, self.heart_disease)
        controls = self.widget.controls
        name = self.heart_disease.domain.class_var.name
        self.assertEqual(controls.row_feature.currentText(), name)
        self.assertEqual(controls.col_feature.currentText(), "(Same as rows)")
        self.assertEqual(controls.val_feature.currentText(), "age")

        self.assertEqual(len(controls.row_feature.model()), 15)
        self.assertEqual(len(controls.col_feature.model()), 11)
        self.assertEqual(len(controls.val_feature.model()), 17)

        domain = self.heart_disease.domain
        for var in domain.variables + domain.metas:
            self.assertIn(var, controls.val_feature.model())
            if var.is_continuous:
                self.assertIn(var, controls.row_feature.model())
                self.assertNotIn(var, controls.col_feature.model())
            elif var.is_discrete:
                self.assertIn(var, controls.row_feature.model())
                self.assertIn(var, controls.col_feature.model())

    def test_feature_combinations(self):
        for cb in self.agg_checkboxes[1:]:
            cb.click()
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_run_through_all(self.widget.controls.row_feature)
        simulate.combobox_run_through_all(self.widget.controls.col_feature)
        simulate.combobox_run_through_all(self.widget.controls.val_feature)

    def test_output_grouped_data(self):
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.agg_checkboxes[Pivot.Sum.value].click()
        grouped = self.get_output(self.widget.Outputs.grouped_data)
        names = ["iris", "(count)", "sepal length (sum)", "sepal width (sum)",
                 "petal length (sum)", "petal width (sum)"]
        self.assertListEqual(names, [a.name for a in grouped.domain.variables])
        self.send_signal(self.widget.Inputs.data, None)
        self.assertIsNone(self.get_output(self.widget.Outputs.grouped_data))

    def test_output_filtered_data(self):
        self.agg_checkboxes[Pivot.Functions.Sum.value].click()
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_activate_item(self.widget.controls.row_feature,
                                        self.iris.domain.attributes[0].name)
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.iris.domain.class_var.name)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[1].name)
        self.assertIsNone(self.get_output(self.widget.Outputs.filtered_data))

        self.widget.table_view.set_selection({(11, 0), (11, 1), (12, 0),
                                              (12, 1), (13, 0), (13, 1),
                                              (14, 0), (14, 1)})
        self.widget.table_view.selection_changed.emit()
        output = self.get_output(self.widget.Outputs.filtered_data)
        self.assertEqual(output.X.shape, (20, 4))
        self.send_signal(self.widget.Inputs.data, None)
        self.assertIsNone(self.get_output(self.widget.Outputs.filtered_data))

    def test_output_pivot_table(self):
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[0].name)
        table = self.get_output(self.widget.Outputs.pivot_table)
        names = ["iris", "Aggregate", "Iris-setosa",
                 "Iris-versicolor", "Iris-virginica"]
        self.assertListEqual(names, [a.name for a in table.domain.variables])
        self.send_signal(self.widget.Inputs.data, None)
        self.assertIsNone(self.get_output(self.widget.Outputs.pivot_table))

    def test_pivot_table_cont_row(self):
        for cb in self.agg_checkboxes[1:]:
            cb.click()
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        simulate.combobox_activate_item(self.widget.controls.row_feature,
                                        self.iris.domain.attributes[0].name)
        self.assertTrue(self.widget.Warning.no_col_feature.is_shown())
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.iris.domain.class_var.name)
        self.assertFalse(self.widget.Warning.no_col_feature.is_shown())

        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[1].name)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.class_var.name)

    def test_pivot_table_disc_row(self):
        for cb in self.agg_checkboxes[1:]:
            cb.click()
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.iris.domain.class_var.name)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[1].name)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.class_var.name)

        self.send_signal(self.widget.Inputs.data, self.zoo)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.zoo.domain.metas[0].name)
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.zoo.domain.attributes[0].name)

    def test_aggregations(self):
        # agg: Count, feature: Continuous
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.assertFalse(self.widget.Warning.cannot_aggregate.is_shown())
        # agg: Count, Sum, feature: Continuous
        self.agg_checkboxes[Pivot.Sum.value].click()
        self.assertFalse(self.widget.Warning.cannot_aggregate.is_shown())
        # agg: Count, Sum, Majority, feature: Continuous
        self.agg_checkboxes[Pivot.Majority.value].click()
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        # agg: Count, Majority, feature: Continuous
        self.agg_checkboxes[Pivot.Sum.value].click()
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        # agg: Count, Majority, feature: Discrete
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.class_var.name)
        self.assertFalse(self.widget.Warning.cannot_aggregate.is_shown())
        # agg: Count, Majority, feature: None
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        "(None)")
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        simulate.combobox_activate_item(self.widget.controls.row_feature,
                                        self.iris.domain.attributes[1].name)
        self.assertTrue(self.widget.Warning.cannot_aggregate.is_shown())
        self.send_signal(self.widget.Inputs.data, None)
        self.assertFalse(self.widget.Warning.cannot_aggregate.is_shown())

    @patch("Orange.widgets.data.owpivot.Pivot._initialize",
           return_value=(None, None))
    def test_group_table_created_once(self, initialize):
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_activate_item(self.widget.controls.row_feature,
                                        self.iris.domain.attributes[0].name)
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.iris.domain.class_var.name)
        initialize.assert_called_with({Pivot.Count},
                                      self.iris.domain.attributes[0])
        initialize.reset_mock()
        # Changing only the value feature must not re-initialize the pivot.
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[1].name)
        initialize.assert_not_called()

    def test_saved_workflow(self):
        """A widget re-created from packed settings restores the selection."""
        self.agg_checkboxes[Pivot.Functions.Sum.value].click()
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_activate_item(self.widget.controls.row_feature,
                                        self.iris.domain.attributes[0].name)
        simulate.combobox_activate_item(self.widget.controls.col_feature,
                                        self.iris.domain.class_var.name)
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.iris.domain.attributes[1].name)
        self.widget.table_view.set_selection({(11, 0), (11, 1), (12, 0),
                                              (12, 1), (13, 0), (13, 1),
                                              (14, 0), (14, 1)})
        self.widget.table_view.selection_changed.emit()
        output = self.get_output(self.widget.Outputs.filtered_data)
        self.assertEqual(output.X.shape, (20, 4))

        settings = self.widget.settingsHandler.pack_data(self.widget)
        w = self.create_widget(self.widget.__class__, stored_settings=settings)
        self.send_signal(w.Inputs.data, self.iris, widget=w)
        # Read the output of the restored widget; querying self.widget here
        # would only repeat the assertion already made above.
        output = self.get_output(w.Outputs.filtered_data, widget=w)
        self.assertEqual(output.X.shape, (20, 4))
        self.assertSetEqual(self.widget.selection, w.selection)

    def test_select_by_click(self):
        view = self.widget.table_view
        self.send_signal(self.widget.Inputs.data, self.heart_disease)
        self.agg_checkboxes[Pivot.Functions.Sum.value].click()
        simulate.combobox_activate_item(self.widget.controls.val_feature,
                                        self.heart_disease.domain[0].name)

        # column in a group
        QTest.mouseClick(view.viewport(), Qt.LeftButton, pos=QPoint(208, 154))
        self.assertSetEqual({(3, 0), (2, 0)}, view.get_selection())

        # column
        QTest.mouseClick(view.viewport(), Qt.LeftButton, pos=QPoint(340, 40))
        self.assertSetEqual({(0, 1), (3, 1), (1, 1), (2, 1)},
                            view.get_selection())

        # group
        QTest.mouseClick(view.viewport(), Qt.LeftButton, pos=QPoint(155, 75))
        self.assertSetEqual({(0, 1), (1, 0), (0, 0), (1, 1)},
                            view.get_selection())

        # all
        QTest.mouseClick(view.viewport(), Qt.LeftButton, pos=QPoint(400, 198))
        self.assertSetEqual({(0, 1), (0, 0), (3, 0), (3, 1), (2, 1), (2, 0),
                             (1, 0), (1, 1)}, view.get_selection())

    def test_send_report(self):
        self.send_signal(self.widget.Inputs.data, self.iris)
        self.widget.report_button.click()
        self.send_signal(self.widget.Inputs.data, None)
        self.widget.report_button.click()


class TestAggregationFunctionsEnum(unittest.TestCase):
    """Pickling and ordering of AggregationFunctionsEnum members."""

    def test_pickle(self):
        self.assertIs(AggregationFunctionsEnum.Sum,
                      loads(dumps(AggregationFunctionsEnum.Sum)))

    def test_sort(self):
        af = AggregationFunctionsEnum
        self.assertEqual(sorted([af.Sum, af.Min]), sorted([af.Min, af.Sum]))


class TestPivot(unittest.TestCase):
    """Tests of the Pivot computation on small hand-built tables."""

    @classmethod
    def setUpClass(cls):
        # cls.table: attributes d1 (a/b), d2 (c/d/e) and continuous c1,
        # nine rows with a missing value in every column.
        domain = Domain([Dv("d1", ("a", "b")),
                         Dv("d2", ("c", "d", "e")), Cv("c1")])
        X = np.array([[0, 0, 1], [0, 1, 2], [0, np.nan, 3], [0, 0, 4],
                      [1, 0, 5], [1, 0, 6], [1, 1, np.nan], [1, 2, 7],
                      [np.nan, 0, 8]])
        cls.table = Table(domain, X)

        # cls.table1: five attributes, a class variable and two string
        # metas, three rows; column c0 is entirely missing.
        domain = Domain([Cv("c0"), Dv("d1", ("a", "b")), Cv("c1"),
                         Dv("d2", ("a", "b")), Cv("c2")],
                        Dv("cls", ("a", "b")), [sv("m1"), sv("m2")])
        X = np.array([[np.nan, 0, 1, 0, 2], [np.nan, 1, 2, np.nan, 3],
                      [np.nan, 0, 3, 1, np.nan]])
        M = np.array([["aa", "dd"], ["bb", "ee"], ["cc", ""]], dtype=object)
        cls.table1 = Table(domain, X, np.array([0, 0, 1]), M)

    @staticmethod
    def _group_atts():
        """Return fresh aggregate variables expected for `cls.table`."""
        return (Cv("(count)"), Cv("d1 (count defined)"),
                Dv("d1 (majority)", ["a", "b"]),
                Cv("d2 (count defined)"), Dv("d2 (majority)", ["c", "d", "e"]),
                Cv("c1 (count defined)"), Cv("c1 (sum)"),
                Cv("c1 (mean)"), Cv("c1 (min)"), Cv("c1 (max)"),
                Cv("c1 (mode)"), Cv("c1 (median)"), Cv("c1 (var)"))

    def _expected_group_table(self):
        """Expected group table of `self.table` by (d1, d2), all functions."""
        domain = self.table.domain
        X = np.array(
            [[0, 0, 2, 2, 0, 2, 0, 2, 5, 2.5, 1, 4, 1, 2.5, 2.25],
             [0, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
             [0, 2, 0, 0, np.nan, 0, np.nan, 0, 0, np.nan,
              np.nan, np.nan, np.nan, np.nan, np.nan],
             [1, 0, 2, 2, 1, 2, 0, 2, 11, 5.5, 5, 6, 5, 5.5, 0.25],
             [1, 1, 1, 1, 1, 1, 1, 0, 0, np.nan, np.nan,
              np.nan, np.nan, np.nan, np.nan],
             [1, 2, 1, 1, 1, 1, 2, 1, 7, 7, 7, 7, 7, 7, 0]])
        return Table(Domain(domain[:2] + self._group_atts()), X)

    @staticmethod
    def _table_with_metas():
        """Build a 3-row table whose last two columns are moved to metas."""
        domain = Domain([Dv("d1", ("a", "b")), Cv("c1"),
                         Dv("d2", ("a", "b")), Cv("c2")])
        X = np.array([[0, 1, 0, 2], [1, 2, np.nan, 3], [0, 3, 1, np.nan]])
        return Table(domain, X).transform(
            Domain(domain.attributes[:2], metas=domain.attributes[2:]))

    def _assert_grouped_by_meta(self, table):
        """Group `table` by `table.domain[-1]` and compare to the fixture."""
        pivot = Pivot(table, Pivot.Functions, table.domain[-1])
        group_tab = pivot.group_table
        atts = (table.domain[-1], Cv("(count)"), Cv("d1 (count defined)"),
                Dv("d1 (majority)", ["a", "b"]),
                Cv("c1 (count defined)"), Cv("c1 (sum)"), Cv("c1 (mean)"),
                Cv("c1 (min)"), Cv("c1 (max)"), Cv("c1 (mode)"),
                Cv("c1 (median)"), Cv("c1 (var)"), Cv("d2 (count defined)"),
                Dv("d2 (majority)", ["a", "b"]), Cv("c2 (count defined)"),
                Cv("c2 (sum)"), Cv("c2 (mean)"), Cv("c2 (min)"), Cv("c2 (max)"),
                Cv("c2 (mode)"), Cv("c2 (median)"), Cv("c2 (var)"))
        X = np.array([[0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                       1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 0],
                      [1, 1, 1, 0, 1, 3, 3, 3, 3, 3, 3, 0, 1, 1, 0, 0, np.nan,
                       np.nan, np.nan, np.nan, np.nan, np.nan]], dtype=float)
        self.assert_table_equal(group_tab, Table(Domain(atts), X))

    def test_group_table(self):
        domain = self.table.domain
        pivot = Pivot(self.table, Pivot.Functions, domain[0], domain[1])
        self.assert_table_equal(pivot.group_table,
                                self._expected_group_table())

    def test_group_table_metas(self):
        table = self._table_with_metas()
        table.metas = table.metas.astype(object)
        self._assert_grouped_by_meta(table)

    # Stacked patch decorators inject mocks bottom-up: the Sum patch is
    # closest to the function and therefore supplies the first argument.
    @patch("Orange.widgets.data.owpivot.Pivot.Count.func",
           side_effect=Pivot.Count.func)
    @patch("Orange.widgets.data.owpivot.Pivot.Sum.func",
           side_effect=Pivot.Sum.func)
    def test_group_table_use_cached(self, sum_func, count_func):
        domain = self.table.domain
        pivot = Pivot(self.table, [Pivot.Count, Pivot.Sum], domain[0], domain[1])
        group_tab = pivot.group_table
        count_func.reset_mock()
        sum_func.reset_mock()

        # Count and Sum were computed at construction; extending the set of
        # functions must not call them again.
        pivot.update_group_table(Pivot.Functions)
        count_func.assert_not_called()
        sum_func.assert_not_called()
        self.assert_table_equal(pivot.group_table,
                                self._expected_group_table())

        pivot.update_group_table([Pivot.Count, Pivot.Sum])
        count_func.assert_not_called()
        sum_func.assert_not_called()
        self.assert_table_equal(pivot.group_table, group_tab)

    def test_group_table_no_col_var(self):
        domain = self.table.domain
        pivot = Pivot(self.table, Pivot.Functions, domain[0])
        group_tab = pivot.group_table
        expected_domain = Domain(domain[:1] + self._group_atts())
        X = np.array([[0, 4, 4, 0, 3, 0, 4, 10, 2.5, 1, 4, 1, 2.5, 1.25],
                      [1, 4, 4, 1, 4, 0, 3, 18, 6, 5, 7, 5, 6, 2 / 3]],
                     dtype=float)
        self.assert_table_equal(group_tab, Table(expected_domain, X))

        # Passing the row variable as the column variable explicitly must
        # give the same result as omitting the column variable.
        pivot = Pivot(self.table, Pivot.Functions, domain[0], domain[0])
        group_tab_same_vars = pivot.group_table
        self.assert_table_equal(group_tab, group_tab_same_vars)

    def test_group_table_no_col_var_metas(self):
        for var in self.table1.domain.metas:
            # agg_funs must be supplied; otherwise the TypeError would come
            # from the missing positional argument, not from the row
            # variable check this test is about.
            self.assertRaises(TypeError, Pivot, self.table1,
                              Pivot.Functions, var)

        self._assert_grouped_by_meta(self._table_with_metas())

    def test_group_table_update(self):
        domain = self.table.domain
        table = self._expected_group_table()

        agg = [Pivot.Functions.Count, Pivot.Functions.Sum]
        pivot = Pivot(self.table, agg, domain[0], domain[1])
        group_tab = pivot.group_table
        pivot.update_group_table(Pivot.Functions)
        self.assert_table_equal(pivot.group_table, table)
        pivot.update_group_table(agg)
        self.assert_table_equal(group_tab, pivot.group_table)

    def test_group_table_1(self):
        var = self.table1.domain.variables[1]
        domain = Domain(
            [var, Cv("(count)"), Cv("c0 (count defined)"), Cv("c0 (sum)"),
             Cv("c0 (mean)"), Cv("c0 (min)"), Cv("c0 (max)"), Cv("c0 (mode)"),
             Cv("c0 (median)"), Cv("c0 (var)"), Cv("d1 (count defined)"),
             Dv("d1 (majority)", ["a", "b"]), Cv("c1 (count defined)"),
             Cv("c1 (sum)"), Cv("c1 (mean)"), Cv("c1 (min)"), Cv("c1 (max)"),
             Cv("c1 (mode)"), Cv("c1 (median)"), Cv("c1 (var)"),
             Cv("d2 (count defined)"), Dv("d2 (majority)", ["a", "b"]),
             Cv("c2 (count defined)"), Cv("c2 (sum)"), Cv("c2 (mean)"),
             Cv("c2 (min)"), Cv("c2 (max)"), Cv("c2 (mode)"),
             Cv("c2 (median)"), Cv("c2 (var)"), Cv("cls (count defined)"),
             Dv("cls (majority)", ["a", "b"]), Cv("m1 (count defined)"),
             Cv("m2 (count defined)")])
        X = np.array([[0, 2, 0, 0, np.nan, np.nan, np.nan, np.nan,
                       np.nan, np.nan, 2, 0, 2, 4, 2, 1, 3, 1, 2, 1,
                       2, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 1],
                      [1, 1, 0, 0, np.nan, np.nan, np.nan, np.nan,
                       np.nan, np.nan, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0,
                       np.nan, 1, 3, 3, 3, 3, 3, 3, 0, 1, 0, 1, 1]])
        pivot = Pivot(self.table1, Pivot.Functions, var)
        group_tab = pivot.group_table
        self.assert_table_equal(group_tab, Table(domain, X))

    def test_group_sparse_data(self):
        var = self.table1.domain.variables[1]
        dense = Pivot(self.table1, Pivot.Functions, var)
        sparse_data = self.table1.to_sparse()
        var = sparse_data.domain.variables[1]
        sparse = Pivot(sparse_data, Pivot.Functions, var)
        self.assert_table_equal(dense.group_table, sparse.group_table)

    def test_pivot(self):
        domain = self.table.domain
        pivot = Pivot(self.table, Pivot.Functions, domain[0], domain[1], domain[2])
        pivot_tab = pivot.pivot_table
        atts = (Dv("Aggregate", ["Count", "Count defined", "Sum", "Mean",
                                 "Min", "Max", "Mode", "Median", "Var"]),
                Cv("c"), Cv("d"), Cv("e"))
        X = np.array([[0, 0, 2, 1, 0],
                      [0, 1, 2, 1, 0],
                      [0, 2, 5, 2, 0],
                      [0, 3, 2.5, 2, np.nan],
                      [0, 4, 1, 2, np.nan],
                      [0, 5, 4, 2, np.nan],
                      [0, 6, 1, 2, np.nan],
                      [0, 7, 2.5, 2, np.nan],
                      [0, 8, 2.25, 0, np.nan],
                      [1, 0, 2, 1, 1],
                      [1, 1, 2, 0, 1],
                      [1, 2, 11, 0, 7],
                      [1, 3, 5.5, np.nan, 7],
                      [1, 4, 5, np.nan, 7],
                      [1, 5, 6, np.nan, 7],
                      [1, 6, 5, np.nan, 7],
                      [1, 7, 5.5, np.nan, 7],
                      [1, 8, 0.25, np.nan, 0]])
        self.assert_table_equal(pivot_tab, Table(Domain(domain[:1] + atts), X))

    def test_pivot_total(self):
        domain = self.table.domain
        pivot = Pivot(self.table, [Pivot.Functions.Count, Pivot.Functions.Sum],
                      domain[0], domain[1], domain[2])

        atts = (Dv(domain[0].name, ["Total"]),
                Dv("Aggregate", ["Count", "Sum"]), Cv("c"), Cv("d"), Cv("e"))
        X = np.array([[0, 0, 4, 2, 1], [0, 1, 16, 2, 7]])
        table = Table(Domain(atts), X)

        self.assert_table_equal(pivot.pivot_total_h, table)
        table = Table(Domain((Cv("Total"),)), np.array([[3], [7], [4], [18]]))
        self.assert_table_equal(pivot.pivot_total_v, table)

        table = Table(Domain((Cv("Total"),)), np.array([[7], [25]]))
        self.assert_table_equal(pivot.pivot_total, table)

    def test_pivot_no_col_var(self):
        domain = self.table.domain
        pivot = Pivot(self.table, Pivot.Functions, domain[0], None, domain[2])
        pivot_tab = pivot.pivot_table
        atts = (Dv("Aggregate",
                   ["Count", "Count defined", "Sum", "Mean",
                    "Min", "Max", "Mode", "Median", "Var"]),
                Cv("a"), Cv("b"))
        X = np.array([[0, 0, 4, 0],
                      [0, 1, 4, 0],
                      [0, 2, 10, 0],
                      [0, 3, 2.5, np.nan],
                      [0, 4, 1, np.nan],
                      [0, 5, 4, np.nan],
                      [0, 6, 1, np.nan],
                      [0, 7, 2.5, np.nan],
                      [0, 8, 1.25, np.nan],
                      [1, 0, 0, 4],
                      [1, 1, 0, 3],
                      [1, 2, 0, 18],
                      [1, 3, np.nan, 6],
                      [1, 4, np.nan, 5],
                      [1, 5, np.nan, 7],
                      [1, 6, np.nan, 5],
                      [1, 7, np.nan, 6],
                      [1, 8, np.nan, 2 / 3]])
        self.assert_table_equal(pivot_tab, Table(Domain(domain[:1] + atts), X))

    def test_pivot_no_val_var(self):
        domain = self.table.domain
        pivot = Pivot(self.table, Pivot.Functions, domain[0], domain[1])
        pivot_tab = pivot.pivot_table
        atts = (Dv("Aggregate", ["Count"]),
                Cv("c"), Cv("d"), Cv("e"))
        X = np.array([[0, 0, 2, 1, 0], [1, 0, 2, 1, 1]])
        self.assert_table_equal(pivot_tab, Table(Domain(domain[:1] + atts), X))

    def test_pivot_disc_val_var(self):
        domain = self.table.domain
        pivot = Pivot(self.table, [Pivot.Count_defined, Pivot.Majority],
                      domain[2], domain[0], domain[1])
        pivot_tab = pivot.pivot_table
        atts = (domain[2], Dv("Aggregate", ["Count defined", "Majority"]),
                Dv("a", ["0.0", "1.0", "c", "d"]),
                Dv("b", ["0.0", "1.0", "c", "e"]))
        X = np.array([[1, 0, 1, 0],
                      [1, 1, 2, np.nan],
                      [2, 0, 1, 0],
                      [2, 1, 3, np.nan],
                      [3, 0, 0, 0],
                      [3, 1, np.nan, np.nan],
                      [4, 0, 1, 0],
                      [4, 1, 2, np.nan],
                      [5, 0, 0, 1],
                      [5, 1, np.nan, 2],
                      [6, 0, 0, 1],
                      [6, 1, np.nan, 2],
                      [7, 0, 0, 1],
                      [7, 1, np.nan, 3],
                      [8, 0, 0, 0],
                      [8, 1, np.nan, np.nan]])
        self.assert_table_equal(pivot_tab, Table(Domain(atts), X))

    def test_pivot_attr_combinations(self):
        domain = self.table1.domain
        for var1, var2, var3 in ((domain[1], domain[3], domain[5]),  # d d d
                                 (domain[1], domain[3], domain[4]),  # d d c
                                 (domain[1], domain[3], domain[-1]),  # d d s
                                 (domain[2], domain[3], domain[5]),  # c d d
                                 (domain[2], domain[3], domain[4]),  # c d c
                                 (domain[2], domain[3], domain[-1])):  # c d s
            pivot = Pivot(self.table1, Pivot.Functions, var1, var2, var3)
            pivot_tab = pivot.pivot_table
            self.assertGreaterEqual(pivot_tab.X.shape[0], 4)
            self.assertGreaterEqual(pivot_tab.X.shape[1], 4)
        for var1, var2 in ((domain[1], domain[2]),  # d c
                           (domain[1], domain[-2]),  # d s
                           (domain[2], domain[4]),  # c
                           (domain[-1], domain[1])):  # s
            # agg_funs is passed explicitly so that var1/var2 really land
            # in the row/column variable parameters being validated.
            self.assertRaises(TypeError, Pivot, self.table1,
                              Pivot.Functions, var1, var2)

    def test_pivot_update(self):
        domain = self.table.domain
        pivot = Pivot(self.table, [Pivot.Functions.Count], domain[0],
                      domain[1], domain[2])
        pivot_tab1 = pivot.pivot_table
        pivot.update_pivot_table(domain[1])
        pivot.update_pivot_table(domain[2])
        self.assert_table_equal(pivot_tab1, pivot.pivot_table)

    def test_pivot_data_subset(self):
        data = Table("iris")
        cls_var = data.domain.class_var
        pivot = Pivot(data[:100], Pivot.Functions, cls_var, None, cls_var)
        atts = (cls_var, Dv("Aggregate", ["Count", "Count defined", "Majority"]),
                Dv("Iris-setosa", ["0.0", "50.0", "Iris-setosa"]),
                Dv("Iris-versicolor", ["0.0", "50.0", "Iris-versicolor"]))
        domain = Domain(atts)
        self.assert_domain_equal(domain, pivot.pivot_table.domain)

    def assert_table_equal(self, table1, table2):
        """Assert that both tables have equal domains, X, Y and metas."""
        self.assert_domain_equal(table1.domain, table2.domain)
        np.testing.assert_array_equal(table1.X, table2.X)
        np.testing.assert_array_equal(table1.Y, table2.Y)
        np.testing.assert_array_equal(table1.metas, table2.metas)

    def assert_domain_equal(self, domain1, domain2):
        """Assert pairwise equal type, name and (if discrete) values."""
        vars1 = domain1.variables + domain1.metas
        vars2 = domain2.variables + domain2.metas
        # zip() stops at the shorter sequence, so compare lengths first;
        # otherwise an extra or missing variable would go unnoticed.
        self.assertEqual(len(vars1), len(vars2))
        for var1, var2 in zip(vars1, vars2):
            self.assertEqual(type(var1), type(var2))
            self.assertEqual(var1.name, var2.name)
            if var1.is_discrete:
                self.assertEqual(var1.values, var2.values)


if __name__ == "__main__":
    unittest.main()