Skip to content

Commit

Permalink
Group By - fix failing/wrong statistics on datetime data
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 30, 2022
1 parent 2979b21 commit a6c4701
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 10 deletions.
45 changes: 38 additions & 7 deletions Orange/widgets/data/owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from orangewidget.utils.signals import Input, Output
from orangewidget.utils import enum_as_int
from orangewidget.widget import Msg
from pandas.core.dtypes.common import is_datetime64_any_dtype

from Orange.data import (
ContinuousVariable,
Expand Down Expand Up @@ -56,26 +57,56 @@ def concatenate(x):
return " ".join(str(v) for v in x if not pd.isnull(v) and len(str(v)) > 0)


def std(s):
    """
    Compute the standard deviation of a series, including datetime series.

    For datetime columns pandas returns the deviation as a ``Timedelta``;
    it is converted to a plain number of seconds so the result is numeric
    for every supported input.

    Parameters
    ----------
    s : pd.Series
        Numeric or datetime values.

    Returns
    -------
    float
        Standard deviation (in seconds for datetime input), or ``nan`` when
        it cannot be computed.
    """
    std_ = s.std()
    if isinstance(std_, pd.Timedelta):
        return std_.total_seconds()
    # When the deviation is undefined (e.g. a single value or only missing
    # values) pandas returns NaT for datetime data. NaT is not a Timedelta,
    # so map every missing result to nan to keep the result numeric.
    return float("nan") if pd.isna(std_) else std_


def var(s):
    """
    Compute the variance of a series, including datetime series.

    pandas does not compute the variance of datetime values directly, so a
    datetime series is first expressed as seconds since the UNIX epoch
    (1970-01-01) and the variance is computed on those numbers.

    Parameters
    ----------
    s : pd.Series
        Numeric or datetime (timezone-naive or timezone-aware) values.

    Returns
    -------
    float
        Variance of the values; for datetime input it is computed on seconds.
    """
    if is_datetime64_any_dtype(s):
        tz_aware = s.dt.tz is not None
        # The epoch must be tz-aware exactly when the data is, otherwise the
        # subtraction below fails.
        initial_ts = pd.Timestamp("1970-01-01", tz="UTC" if tz_aware else None)
        if tz_aware:
            # Bring the values to UTC before subtracting (kept for pandas
            # versions before 1.4). The .dt accessor is required here:
            # Series.tz_convert converts the *index*, not the values, and
            # raises TypeError when the index is not a DatetimeIndex.
            s = s.dt.tz_convert("UTC")
        s = (s - initial_ts) / pd.Timedelta("1s")
    var_ = s.var()
    return var_.total_seconds() if isinstance(var_, pd.Timedelta) else var_


def span(s):
    """
    Compute the span (maximum minus minimum) of a series, including
    datetime series.

    Subtracting two timestamps yields a ``Timedelta``; it is converted to a
    plain number of seconds so the result is numeric for every supported
    input.

    Parameters
    ----------
    s : pd.Series
        Numeric or datetime values.

    Returns
    -------
    float
        Difference between the largest and the smallest value (in seconds
        for datetime input), or ``nan`` when it cannot be computed.
    """
    span_ = pd.Series.max(s) - pd.Series.min(s)
    if isinstance(span_, pd.Timedelta):
        return span_.total_seconds()
    # With only missing datetimes max and min are NaT, and NaT - NaT is NaT,
    # which is not a Timedelta. Map every missing result to nan so the
    # result stays numeric.
    return float("nan") if pd.isna(span_) else span_


AGGREGATIONS = {
"Mean": Aggregation("mean", {ContinuousVariable, TimeVariable}),
"Median": Aggregation("median", {ContinuousVariable, TimeVariable}),
"Mode": Aggregation(
lambda x: pd.Series.mode(x).get(0, nan),
{ContinuousVariable, DiscreteVariable, TimeVariable}
),
"Standard deviation": Aggregation("std", {ContinuousVariable, TimeVariable}),
"Variance": Aggregation("var", {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable, TimeVariable}),
"Standard deviation": Aggregation(std, {ContinuousVariable, TimeVariable}),
"Variance": Aggregation(var, {ContinuousVariable, TimeVariable}),
"Sum": Aggregation("sum", {ContinuousVariable}),
"Concatenate": Aggregation(
concatenate,
{ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable},
),
"Min. value": Aggregation("min", {ContinuousVariable, TimeVariable}),
"Max. value": Aggregation("max", {ContinuousVariable, TimeVariable}),
"Span": Aggregation(
lambda x: pd.Series.max(x) - pd.Series.min(x),
{ContinuousVariable, TimeVariable},
),
"Span": Aggregation(span, {ContinuousVariable, TimeVariable}),
"First value": Aggregation(
"first", {ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable}
),
Expand Down
1 change: 1 addition & 0 deletions Orange/widgets/data/tests/test_oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,7 @@ def test_raise_pandas_version(self):
When this test starts to fail:
- remove this test
- remove if clause in datetime_to_epoch function and supporting comments
- remove the same if clause in the var function in owgroupby (lines 77, 78)
- set pandas dependency version to pandas>=1.4
"""
from datetime import datetime
Expand Down
153 changes: 150 additions & 3 deletions Orange/widgets/data/tests/test_owgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
table_to_frame,
Domain,
ContinuousVariable,
DiscreteVariable,
TimeVariable,
)
from Orange.data.tests.test_aggregate import create_sample_data
from Orange.widgets.data.owgroupby import OWGroupBy
Expand Down Expand Up @@ -690,16 +692,161 @@ def test_time_variable(self):

# time variable as a group by variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [1])
self._set_selection(self.widget.gb_attrs_view, [3])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(3, len(output))

# time variable as a grouped variable
self.send_signal(self.widget.Inputs.data, data)
self._set_selection(self.widget.gb_attrs_view, [5])
attributes = [data.domain["c2"], data.domain["d2"]]
self.send_signal(self.widget.Inputs.data, data[:, attributes])
self._set_selection(self.widget.gb_attrs_view, [1]) # d2
# check all aggregations
self.assert_aggregations_equal(["Mean", "Mode"])
self.select_table_rows(self.widget.agg_table_view, [0]) # c2
for cb in self.widget.agg_checkboxes.values():
if cb.text() != "Mean":
cb.click()
self.assert_aggregations_equal(["Mean, Median, Mode and 12 more", "Mode"])
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(2, len(output))

def test_time_variable_results(self):
    """
    Check the output of every aggregation applied to a time variable (T)
    that is grouped by a discrete variable (G).
    """
    group_var = DiscreteVariable("G", values=["G1", "G2"])
    time_var = TimeVariable("T", have_time=True, have_date=True)
    # first column: group index, second column: time value (the expected
    # timestamps below correspond to seconds after 1970-01-01 00:00:00)
    values = np.array([[0.0, 0.0], [0, 10], [0, 20], [1, 500], [1, 1000]])
    self.send_signal(
        self.widget.Inputs.data,
        Table.from_numpy(Domain([group_var, time_var]), values),
    )

    table_view = self.widget.agg_table_view
    # row 0 is G: clicking "Mode" leaves it without any aggregation
    self.select_table_rows(table_view, [0])
    self.widget.agg_checkboxes["Mode"].click()
    # row 1 is T: click every aggregation checkbox except "Mean"
    self.select_table_rows(table_view, [1])
    for checkbox in self.widget.agg_checkboxes.values():
        if checkbox.text() != "Mean":
            checkbox.click()
    self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

    # timestamps occurring in the expected output, named by their offset
    # in seconds from 1970-01-01 00:00:00
    t0, t10, t20 = (
        "1970-01-01 00:00:00",
        "1970-01-01 00:00:10",
        "1970-01-01 00:00:20",
    )
    t500, t750, t1000 = (
        "1970-01-01 00:08:20",
        "1970-01-01 00:12:30",
        "1970-01-01 00:16:40",
    )
    # column order matters for the frame comparison below
    expected = pd.DataFrame(
        {
            "T - Mean": [t10, t750],
            "T - Median": [t10, t750],
            "T - Mode": [t0, t500],
            "T - Standard deviation": [10, 353.5533905932738],
            "T - Variance": [100, 125000],
            "T - Min. value": [t0, t500],
            "T - Max. value": [t20, t1000],
            "T - Span": [20, 500],
            "T - First value": [t0, t500],
            "T - Last value": [t20, t1000],
            "T - Count defined": [3, 2],
            "T - Count": [3, 2],
            "T - Proportion defined": [1, 1],
            "T - Concatenate": [
                "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
                "1970-01-01 00:08:20 1970-01-01 00:16:40",
            ],
            "G": ["G1", "G2"],
        }
    )
    # these aggregations yield timestamps rather than plain numbers
    datetime_columns = [
        "T - Mean",
        "T - Median",
        "T - Mode",
        "T - Min. value",
        "T - Max. value",
        "T - First value",
        "T - Last value",
    ]
    expected[datetime_columns] = expected[datetime_columns].apply(pd.to_datetime)

    result = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True
    )
    # the random-value aggregation cannot be compared against fixed values
    is_random = result.columns.str.endswith("Random value")
    result = result.loc[:, ~is_random]

    pd.testing.assert_frame_equal(
        result,
        expected,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )

def test_tz_time_variable_results(self):
    """
    Check the output of every aggregation applied to a time variable whose
    values are parsed from strings carrying a +01:00 UTC offset.
    """
    time_var = TimeVariable("T", have_time=True, have_date=True)
    group_var = DiscreteVariable("G", values=["G1", "G2"])
    domain = Domain([group_var, time_var])
    stamps = [
        "1970-01-01 01:00:00+01:00",
        "1970-01-01 01:00:10+01:00",
        "1970-01-01 01:00:20+01:00",
    ]
    # every row belongs to the first group (G1)
    values = np.array([[0.0, time_var.parse(stamp)] for stamp in stamps])
    self.send_signal(self.widget.Inputs.data, Table.from_numpy(domain, values))

    table_view = self.widget.agg_table_view
    # row 0 is G: clicking "Mode" leaves it without any aggregation
    self.select_table_rows(table_view, [0])
    self.widget.agg_checkboxes["Mode"].click()
    # row 1 is T: click every aggregation checkbox except "Mean"
    self.select_table_rows(table_view, [1])
    for checkbox in self.widget.agg_checkboxes.values():
        if checkbox.text() != "Mean":
            checkbox.click()
    self.assert_aggregations_equal(["", "Mean, Median, Mode and 12 more"])

    # expected timestamps are one hour earlier than the parsed strings
    t0, t10, t20 = (
        "1970-01-01 00:00:00",
        "1970-01-01 00:00:10",
        "1970-01-01 00:00:20",
    )
    # column order matters for the frame comparison below
    expected = pd.DataFrame(
        {
            "T - Mean": [t10],
            "T - Median": [t10],
            "T - Mode": [t0],
            "T - Standard deviation": [10],
            "T - Variance": [100],
            "T - Min. value": [t0],
            "T - Max. value": [t20],
            "T - Span": [20],
            "T - First value": [t0],
            "T - Last value": [t20],
            "T - Count defined": [3],
            "T - Count": [3],
            "T - Proportion defined": [1],
            "T - Concatenate": [
                "1970-01-01 00:00:00 1970-01-01 00:00:10 1970-01-01 00:00:20",
            ],
            "G": ["G1"],
        }
    )
    # these aggregations yield timestamps rather than plain numbers
    datetime_columns = [
        "T - Mean",
        "T - Median",
        "T - Mode",
        "T - Min. value",
        "T - Max. value",
        "T - First value",
        "T - Last value",
    ]
    expected[datetime_columns] = expected[datetime_columns].apply(pd.to_datetime)

    result = table_to_frame(
        self.get_output(self.widget.Outputs.data), include_metas=True
    )
    # the random-value aggregation cannot be compared against fixed values
    is_random = result.columns.str.endswith("Random value")
    result = result.loc[:, ~is_random]

    pd.testing.assert_frame_equal(
        result,
        expected,
        check_dtype=False,
        check_column_type=False,
        check_categorical=False,
        atol=1e-3,
    )

def test_only_nan_in_group(self):
data = Table(
Domain([ContinuousVariable("A"), ContinuousVariable("B")]),
Expand Down

0 comments on commit a6c4701

Please sign in to comment.