Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Group By - fix std and sum for TimeVariable #6133

Merged
merged 1 commit into from
Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 52 additions & 7 deletions Orange/widgets/data/owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from orangewidget.utils.signals import Input, Output
from orangewidget.utils import enum_as_int
from orangewidget.widget import Msg
from pandas.core.dtypes.common import is_datetime64_any_dtype

from Orange.data import (
ContinuousVariable,
Expand Down Expand Up @@ -56,26 +57,59 @@ def concatenate(x):
return " ".join(str(v) for v in x if not pd.isnull(v) and len(str(v)) > 0)


def std(s):
    """
    Standard deviation that also works for time variables.

    For datetime columns pandas reports the standard deviation as a
    Timedelta; it is converted to seconds here so the result stays numeric.
    """
    deviation = s.std()
    if isinstance(deviation, pd.Timedelta):
        return deviation.total_seconds()
    # when the value cannot be computed pandas may return NaT - report it
    # as a float nan to keep the column numeric
    if pd.isna(deviation):
        return nan
    return deviation


def var(s):
    """
    Variance that also handles time variables.

    Pandas's variance function does not support datetime arrays, so a
    datetime series is first converted to seconds since the UNIX epoch and
    the variance is computed on those numbers.

    Parameters
    ----------
    s : pd.Series
        Numeric or datetime (timezone-naive or timezone-aware) values.

    Returns
    -------
    Variance of the values; for datetime input it is computed on seconds
    since 1970-01-01.
    """
    if is_datetime64_any_dtype(s):
        tz_aware = s.dt.tz is not None
        # the subtracted timestamp must be tz-aware when the series is,
        # otherwise the subtraction fails
        initial_ts = pd.Timestamp("1970-01-01", tz="UTC" if tz_aware else None)
        if tz_aware:
            # Normalise the values to UTC before subtracting (presumably
            # needed for pandas < 1.4 - confirm before removing).
            # Series.tz_convert would convert the *index*, not the values,
            # and raise on a non-datetime index - use the .dt accessor.
            s = s.dt.tz_convert("UTC")
        s = (s - initial_ts) / pd.Timedelta("1s")
    var_ = s.var()
    return var_.total_seconds() if isinstance(var_, pd.Timedelta) else var_


def span(s):
    """
    Span (maximum minus minimum) that also works for time variables.

    Subtracting two datetimes yields a Timedelta; in that case the result
    is converted to seconds.
    """
    highest = pd.Series.max(s)
    lowest = pd.Series.min(s)
    difference = highest - lowest
    if isinstance(difference, pd.Timedelta):
        return difference.total_seconds()
    return difference


AGGREGATIONS = {
"Mean": Aggregation("mean", {ContinuousVariable, TimeVariable}),
"Median": Aggregation("median", {ContinuousVariable, TimeVariable}),
"Mode": Aggregation(
lambda x: pd.Series.mode(x).get(0, nan),
{ContinuousVariable, DiscreteVariable, TimeVariable}
),
"Standard deviation": Aggregation("std", {ContinuousVariable, TimeVariable}),
"Variance": Aggregation("var", {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable, TimeVariable}),
"Standard deviation": Aggregation(std, {ContinuousVariable, TimeVariable}),
"Variance": Aggregation(var, {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable}),
"Concatenate": Aggregation(
concatenate,
{ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable},
),
"Min. value": Aggregation("min", {ContinuousVariable, TimeVariable}),
"Max. value": Aggregation("max", {ContinuousVariable, TimeVariable}),
"Span": Aggregation(
lambda x: pd.Series.max(x) - pd.Series.min(x),
{ContinuousVariable, TimeVariable},
),
"Span": Aggregation(span, {ContinuousVariable, TimeVariable}),
"First value": Aggregation(
"first", {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable}
),
Expand Down Expand Up @@ -506,6 +540,17 @@ def __aggregation_compatible(agg, attr):
"""Check a compatibility of aggregation with the variable"""
return type(attr) in AGGREGATIONS[agg].types

@classmethod
def migrate_context(cls, context, _):
    """
    Remove "Sum" from the stored aggregations of time variables.

    The widget used to allow Sum on a TimeVariable; that is now forbidden,
    so the aggregation is dropped from previously saved contexts.
    """
    stored = context.values["aggregations"][0]
    for key, selected in stored.items():
        # only two-element keys are inspected; a second element of 104
        # marks a TimeVariable (per the original docstring)
        if len(key) == 2 and key[1] == 104:
            selected.discard("Sum")


if __name__ == "__main__":
# pylint: disable=ungrouped-imports
Expand Down
1 change: 1 addition & 0 deletions Orange/widgets/data/tests/test_oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,7 @@ def test_raise_pandas_version(self):
When this test start to fail:
- remove this test
- remove if clause in datetime_to_epoch function and supporting comments
- remove same if clause in var function in owgroupby (line 77, 78)
- set pandas dependency version to pandas>=1.4
"""
from datetime import datetime
Expand Down
226 changes: 223 additions & 3 deletions Orange/widgets/data/tests/test_owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
table_to_frame,
Domain,
ContinuousVariable,
DiscreteVariable,
TimeVariable,
StringVariable,
)
from Orange.data.tests.test_aggregate import create_sample_data
from Orange.widgets.data.owgroupby import OWGroupBy
Expand Down Expand Up @@ -665,6 +668,28 @@ def test_context(self):
self.widget.aggregations,
)

def test_context_time_variable(self):
"""
Test that migrate_context removes "Sum" from the aggregations stored for
a TimeVariable, since GroupBy no longer supports Sum on time variables.
"""
tv = TimeVariable("T", have_time=True, have_date=True)
data = Table.from_numpy(
Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
np.array([[0.0, 0.0], [0, 10], [0, 20], [1, 500], [1, 1000]]),
)
self.send_signal(self.widget.Inputs.data, data)
# add the aggregations directly - "Sum" is the one expected to be dropped
self.widget.aggregations[tv].add("Sum")
self.widget.aggregations[tv].add("Median")
# send different data - presumably so the context for `data` gets stored
self.send_signal(self.widget.Inputs.data, self.iris)

# create a fresh widget from the packed settings of the first one
widget = self.create_widget(
OWGroupBy,
stored_settings=self.widget.settingsHandler.pack_data(self.widget),
)
self.send_signal(widget.Inputs.data, data, widget=widget)
# "Sum" must be gone while "Median" (and the already present "Mean") stay
self.assertSetEqual(widget.aggregations[tv], {"Mean", "Median"})

@patch(
"Orange.data.aggregate.OrangeTableGroupBy.aggregate",
Mock(side_effect=ValueError("Test unexpected err")),
Expand All @@ -690,16 +715,211 @@ def test_time_variable(self):

# time variable as a group by variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [1])
self._set_selection(self.widget.gb_attrs_view, [3])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(3, len(output))

# time variable as a grouped variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [5])
attributes = [data.domain["c2"], data.domain["d2"]]
self.send_signal(self.widget.Inputs.data, data[:, attributes])
self._set_selection(self.widget.gb_attrs_view, [1]) # d2
# check all aggregations
self.assert_aggregations_equal(["Mean", "Mode"])
self.select_table_rows(self.widget.agg_table_view, [0]) # c2
for cb in self.widget.agg_checkboxes.values():
if cb.text() != "Mean":
cb.click()
self.assert_aggregations_equal(["Mean, Median, Mode and 12 more", "Mode"])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(2, len(output))

def test_time_variable_results(self):
"""
Check the output values and the output domain when every available
aggregation is selected for a time variable.
"""
data = Table.from_numpy(
Domain(
[
DiscreteVariable("G", values=["G1", "G2", "G3"]),
TimeVariable("T", have_time=True, have_date=True),
]
),
np.array([[0.0, 0], [0, 10], [0, 20], [1, 500], [1, 1000], [2, 1]]),
)
self.send_signal(self.widget.Inputs.data, data)

# disable aggregating G
self.select_table_rows(self.widget.agg_table_view, [0])  # G
self.widget.agg_checkboxes["Mode"].click()
# select all possible aggregations for T
self.select_table_rows(self.widget.agg_table_view, [1])  # T
for cb in self.widget.agg_checkboxes.values():
# "Mean" is skipped - the assertion below shows it ends up selected,
# so clicking it here would presumably deselect it
if cb.text() != "Mean":
cb.click()
self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

# one row per group (G1, G2, G3); G3 has a single value, hence nan
# for standard deviation and variance
expected_df = pd.DataFrame(
{
"T - Mean": [
"1970-01-01 00:00:10",
"1970-01-01 00:12:30",
"1970-01-01 00:00:01",
],
"T - Median": [
"1970-01-01 00:00:10",
"1970-01-01 00:12:30",
"1970-01-01 00:00:01",
],
"T - Mode": [
"1970-01-01 00:00:00",
"1970-01-01 00:08:20",
"1970-01-01 00:00:01",
],
"T - Standard deviation": [10, 353.5533905932738, np.nan],
"T - Variance": [100, 125000, np.nan],
"T - Min. value": [
"1970-01-01 00:00:00",
"1970-01-01 00:08:20",
"1970-01-01 00:00:01",
],
"T - Max. value": [
"1970-01-01 00:00:20",
"1970-01-01 00:16:40",
"1970-01-01 00:00:01",
],
"T - Span": [20, 500, 0],
"T - First value": [
"1970-01-01 00:00:00",
"1970-01-01 00:08:20",
"1970-01-01 00:00:01",
],
"T - Last value": [
"1970-01-01 00:00:20",
"1970-01-01 00:16:40",
"1970-01-01 00:00:01",
],
"T - Count defined": [3, 2, 1],
"T - Count": [3, 2, 1],
"T - Proportion defined": [1, 1, 1],
"T - Concatenate": [
"1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
"1970-01-01 00:08:20 1970-01-01 00:16:40",
"1970-01-01 00:00:01",
],
"G": ["G1", "G2", "G3"],
}
)
# columns expected to hold datetimes - parse the strings above
df_col = [
"T - Mean",
"T - Median",
"T - Mode",
"T - Min. value",
"T - Max. value",
"T - First value",
"T - Last value",
]
expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
output = self.get_output(self.widget.Outputs.data)
output_df = table_to_frame(output, include_metas=True)
# remove random since it is not possible to test
output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]

pd.testing.assert_frame_equal(
output_df,
expected_df,
check_dtype=False,
check_column_type=False,
check_categorical=False,
atol=1e-3,
)
# the output domain is checked too: std, variance, span and the counts
# are expected as continuous variables, the rest as time variables
expected_attributes = (
TimeVariable("T - Mean", have_date=1, have_time=1),
TimeVariable("T - Median", have_date=1, have_time=1),
TimeVariable("T - Mode", have_date=1, have_time=1),
ContinuousVariable(name="T - Standard deviation"),
ContinuousVariable(name="T - Variance"),
TimeVariable("T - Min. value", have_date=1, have_time=1),
TimeVariable("T - Max. value", have_date=1, have_time=1),
ContinuousVariable(name="T - Span"),
TimeVariable("T - First value", have_date=1, have_time=1),
TimeVariable("T - Last value", have_date=1, have_time=1),
TimeVariable("T - Random value", have_date=1, have_time=1),
ContinuousVariable(name="T - Count defined"),
ContinuousVariable(name="T - Count"),
ContinuousVariable(name="T - Proportion defined"),
)
expected_metas = (
StringVariable(name="T - Concatenate"),
DiscreteVariable(name="G", values=("G1", "G2", "G3")),
)
self.assertTupleEqual(output.domain.attributes, expected_attributes)
self.assertTupleEqual(output.domain.metas, expected_metas)

def test_tz_time_variable_results(self):
"""
Test aggregation results for a time variable whose values are parsed
from strings carrying a timezone offset (+01:00).
"""
tv = TimeVariable("T", have_time=True, have_date=True)
data = Table.from_numpy(
Domain([DiscreteVariable("G", values=["G1", "G2"]), tv]),
np.array([[0.0, tv.parse("1970-01-01 01:00:00+01:00")],
[0, tv.parse("1970-01-01 01:00:10+01:00")],
[0, tv.parse("1970-01-01 01:00:20+01:00")]]),
)

self.send_signal(self.widget.Inputs.data, data)

# disable aggregating G
self.select_table_rows(self.widget.agg_table_view, [0])  # G
self.widget.agg_checkboxes["Mode"].click()
# select all possible aggregations for T
self.select_table_rows(self.widget.agg_table_view, [1])  # T
for cb in self.widget.agg_checkboxes.values():
# "Mean" is skipped - the assertion below shows it ends up selected,
# so clicking it here would presumably deselect it
if cb.text() != "Mean":
cb.click()
self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

# all rows belong to G1, so a single output row is expected; the expected
# times are the inputs shifted by the +01:00 offset (01:00:xx -> 00:00:xx)
expected_df = pd.DataFrame(
{
"T - Mean": ["1970-01-01 00:00:10"],
"T - Median": ["1970-01-01 00:00:10"],
"T - Mode": ["1970-01-01 00:00:00"],
"T - Standard deviation": [10],
"T - Variance": [100],
"T - Min. value": ["1970-01-01 00:00:00"],
"T - Max. value": ["1970-01-01 00:00:20"],
"T - Span": [20, ],
"T - First value": ["1970-01-01 00:00:00"],
"T - Last value": ["1970-01-01 00:00:20"],
"T - Count defined": [3],
"T - Count": [3],
"T - Proportion defined": [1],
"T - Concatenate": [
"1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
],
"G": ["G1"],
}
)
# columns expected to hold datetimes - parse the strings above
df_col = [
"T - Mean",
"T - Median",
"T - Mode",
"T - Min. value",
"T - Max. value",
"T - First value",
"T - Last value",
]
expected_df[df_col] = expected_df[df_col].apply(pd.to_datetime)
output_df = table_to_frame(
self.get_output(self.widget.Outputs.data), include_metas=True
)
# remove random since it is not possible to test
output_df = output_df.loc[:, ~output_df.columns.str.endswith("Random value")]

pd.testing.assert_frame_equal(
output_df,
expected_df,
check_dtype=False,
check_column_type=False,
check_categorical=False,
atol=1e-3,
)

def test_only_nan_in_group(self):
data = Table(
Domain([ContinuousVariable("A"), ContinuousVariable("B")]),
Expand Down