From b2c7c75edd9e9ad5b7e1cc5b370402bb47a951dc Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Fri, 6 Aug 2021 14:09:57 +0200 Subject: [PATCH 1/2] pandas_compat: fix conversion of datetime --- Orange/data/pandas_compat.py | 57 +++++++- Orange/data/tests/test_pandas.py | 244 ++++++++++++++++++++++++++++++- Orange/data/variable.py | 28 +++- 3 files changed, 314 insertions(+), 15 deletions(-) diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py index e05bf9d8fba..7321b6d7f72 100644 --- a/Orange/data/pandas_compat.py +++ b/Orange/data/pandas_compat.py @@ -157,13 +157,56 @@ def _is_datetime(s): return True try: if is_object_dtype(s): - pd.to_datetime(s, infer_datetime_format=True) + # utc=True - to allow different timezones in a series object + pd.to_datetime(s, infer_datetime_format=True, utc=True) return True except Exception: # pylint: disable=broad-except pass return False +def _convert_datetime(series, var): + def col_type(dt): + """Test if is date, time or datetime""" + dt_nonnat = dt[~pd.isnull(dt)] # nat == nat is False + if (dt_nonnat.dt.floor("d") == dt_nonnat).all(): + # all times are 00:00:00.0 - pure date + return 1, 0 + elif (dt_nonnat.dt.date == pd.Timestamp("now").date()).all(): + # all dates are today's date - pure time + return 0, 1 # pure time + else: + # else datetime + return 1, 1 + + try: + dt = pd.to_datetime(series) + except ValueError: + # series with type object and different timezones will raise a + # ValueError - normalizing to utc + dt = pd.to_datetime(series, utc=True) + + # set variable type to date, time or datetime + var.have_date, var.have_time = col_type(dt) + + if dt.dt.tz is not None: + # set timezone if available and convert to utc + var.timezone = dt.dt.tz + dt = dt.dt.tz_convert("UTC") + + if var.have_time and not var.have_date: + # if time only measure seconds from midnight - equal to setting date + # to unix epoch + return ( + (dt.dt.tz_localize(None) - pd.Timestamp("now").normalize()) + / pd.Timedelta("1s") + ).values + + return ( + (dt.dt.tz_localize(None) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s") + ).values + + def vars_from_df(df, role=None, force_nominal=False): if role is None and hasattr(df, 'orange_role'): _role = df.orange_role @@ -210,6 +253,11 @@ def vars_from_df(df, role=None, force_nominal=False): Mcols.append(column) Mexpr.append(None) metas.append(var) + elif _is_datetime(s): + var = TimeVariable(str(column)) + attrs.append(var) + Xcols.append(column) + Xexpr.append(_convert_datetime) elif _is_discrete(s, force_nominal): discrete = s.astype('category').cat var = DiscreteVariable(str(column), @@ -224,13 +272,6 @@ def to_cat(s, _): return np.asarray(x) Xexpr.append(to_cat) - elif _is_datetime(s): - var = TimeVariable(str(column)) - attrs.append(var) - Xcols.append(column) - Xexpr.append(lambda s, v: np.asarray( - s.astype('str').replace('NaT', np.nan).map(v.parse) - )) elif is_numeric_dtype(s): var = ContinuousVariable( # set number of decimals to 0 if int else keeps default behaviour diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py index 1ffc4620794..8975f508ca3 100644 --- a/Orange/data/tests/test_pandas.py +++ b/Orange/data/tests/test_pandas.py @@ -1,13 +1,16 @@ # pylint: disable=import-outside-toplevel - import unittest +from datetime import date, datetime, timezone + import numpy as np +import pytz from scipy.sparse import csr_matrix import scipy.sparse as sp from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table, Domain, \ StringVariable from Orange.data.pandas_compat import OrangeDataFrame +from Orange.data.tests.test_variable import TestTimeVariable try: import pandas as pd @@ -164,6 +167,245 @@ def test_not_orangedf(self): for v1, v2 in zip(vars1, vars2): self.assertEqual(type(v1), type(v2)) + def test_table_from_frame_date(self): + from Orange.data.pandas_compat import table_from_frame + + df = pd.DataFrame( + [[pd.Timestamp("2017-12-19")], [pd.Timestamp("1724-12-20")], [np.nan]] + ) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19").timestamp()], + [pd.Timestamp("1724-12-20").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 0) + self.assertEqual(table.domain.variables[0].have_date, 1) + + df = pd.DataFrame([["2017-12-19"], ["1724-12-20"], [np.nan]]) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19").timestamp()], + [pd.Timestamp("1724-12-20").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 0) + self.assertEqual(table.domain.variables[0].have_date, 1) + + df = pd.DataFrame([[date(2017, 12, 19)], [date(1724, 12, 20)], [np.nan]]) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19").timestamp()], + [pd.Timestamp("1724-12-20").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 0) + self.assertEqual(table.domain.variables[0].have_date, 1) + + def test_table_from_frame_time(self): + from Orange.data.pandas_compat import table_from_frame + + df = pd.DataFrame( + [[pd.Timestamp("00:00:00.25")], [pd.Timestamp("20:20:20.30")], [np.nan]] + ) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()], + [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 1) + self.assertEqual(table.domain.variables[0].have_date, 0) + + df = pd.DataFrame([["00:00:00.25"], ["20:20:20.30"], [np.nan]]) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()], + [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 1) + self.assertEqual(table.domain.variables[0].have_date, 0) + + def test_table_from_frame_datetime(self): + from Orange.data.pandas_compat import table_from_frame + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00.50")], + [pd.Timestamp("1724-12-20 20:20:20.30")], + [np.nan], + ] + ) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 1) + self.assertEqual(table.domain.variables[0].have_date, 1) + + df = pd.DataFrame( + [["2017-12-19 00:00:00.50"], ["1724-12-20 20:20:20.30"], [np.nan]] + ) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 1) + self.assertEqual(table.domain.variables[0].have_date, 1) + + df = pd.DataFrame( + [ + [datetime(2017, 12, 19, 0, 0, 0, 500000)], + [datetime(1724, 12, 20, 20, 20, 20, 300000)], + [np.nan], + ] + ) + table = table_from_frame(df) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()], + [np.nan], + ], + ) + self.assertEqual(table.domain.variables[0].have_time, 1) + self.assertEqual(table.domain.variables[0].have_date, 1) + + def test_table_from_frame_timezones(self): + from Orange.data.pandas_compat import table_from_frame + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00")], + [pd.Timestamp("1724-12-20 20:20:20")], + [np.nan], + ] + ) + table = table_from_frame(df) + self.assertIsNone(table.domain.variables[0].utc_offset) + self.assertEqual(table.domain.variables[0].timezone, timezone.utc) + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00Z")], + [pd.Timestamp("1724-12-20 20:20:20Z")], + [np.nan], + ] + ) + table = table_from_frame(df) + self.assertEqual(pytz.utc, table.domain.variables[0].timezone) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20").timestamp()], + [np.nan], + ], + ) + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00+1")], + [pd.Timestamp("1724-12-20 20:20:20+1")], + [np.nan], + ] + ) + table = table_from_frame(df) + self.assertEqual(pytz.FixedOffset(60), table.domain.variables[0].timezone) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20+1").timestamp()], + [np.nan], + ], + ) + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00", tz="CET")], + [pd.Timestamp("1724-12-20 20:20:20", tz="CET")], + [np.nan], + ] + ) + table = table_from_frame(df) + self.assertEqual(pytz.timezone("CET"), table.domain.variables[0].timezone) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20+1").timestamp()], + [np.nan], + ], + ) + + df = pd.DataFrame( + [ + [pd.Timestamp("2017-12-19 00:00:00", tz="CET")], + [pd.Timestamp("1724-12-20 20:20:20")], + [np.nan], + ] + ) + table = table_from_frame(df) + self.assertEqual(pytz.utc, table.domain.variables[0].timezone) + np.testing.assert_equal( + table.X, + [ + [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()], + [pd.Timestamp("1724-12-20 20:20:20").timestamp()], + [np.nan], + ], + ) + + def test_time_variable_compatible(self): + from Orange.data.pandas_compat import table_from_frame + + def to_df(val): + return pd.DataFrame([[pd.Timestamp(val)]]) + + for datestr, timestamp, outstr in TestTimeVariable.TESTS: + var = TimeVariable("time") + var_parse = var.to_val(datestr) + try: + pandas_parse = table_from_frame(to_df(datestr)).X[0, 0] + except ValueError: + # pandas cannot parse some formats in the list skip them + continue + if not (np.isnan(var_parse) and np.isnan(pandas_parse)): + # nan == nan => False + self.assertEqual(var_parse, pandas_parse) + self.assertEqual(pandas_parse, timestamp) + + self.assertEqual(var.repr_val(var_parse), var.repr_val(var_parse)) + self.assertEqual(outstr, var.repr_val(var_parse)) + @unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow") def test_table_to_frame_on_all_orange_dataset(self): from os import listdir diff --git a/Orange/data/variable.py b/Orange/data/variable.py index b538ef082fc..19a9b3f573a 100644 --- a/Orange/data/variable.py +++ b/Orange/data/variable.py @@ -934,7 +934,7 @@ def __init__(self, date_string): # UTC offset and associated timezone. If parsed datetime values provide an # offset, it is used for display. If not all values have the same offset, # +0000 (=UTC) timezone is used and utc_offset is set to False. - utc_offset = None + _utc_offset = None timezone = timezone.utc def __init__(self, *args, have_date=0, have_time=0, **kwargs): @@ -942,6 +942,22 @@ def __init__(self, *args, have_date=0, have_time=0, **kwargs): self.have_date = have_date self.have_time = have_time + @property + def utc_offset(self): + warnings.warn( + "utc_offset is deprecated and will be removed in Orange 3.31", + OrangeDeprecationWarning + ) + return self._utc_offset + + @utc_offset.setter + def utc_offset(self, val): + warnings.warn( + "utc_offset is deprecated and will be removed in Orange 3.31", + OrangeDeprecationWarning + ) + self._utc_offset = val + def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_): return super().copy(compute_value=compute_value, name=name, have_date=self.have_date, have_time=self.have_time) @@ -1030,12 +1046,12 @@ def parse(self, datestr): # Remember UTC offset. If not all parsed values share the same offset, # remember none of it. offset = dt.utcoffset() - if self.utc_offset is not False: - if offset and self.utc_offset is None: - self.utc_offset = offset + if self._utc_offset is not False: + if offset and self._utc_offset is None: + self._utc_offset = offset self.timezone = timezone(offset) - elif self.utc_offset != offset: - self.utc_offset = False + elif self._utc_offset != offset: + self._utc_offset = False self.timezone = timezone.utc # Convert time to UTC timezone. In dates without timezone, From 672de2ff35bade6c5b3ef2796957f42d0a359e86 Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Wed, 11 Aug 2021 09:21:01 +0200 Subject: [PATCH 2/2] TimeVariable: handle different timezones --- Orange/data/tests/test_pandas.py | 1 - Orange/data/tests/test_variable.py | 10 +++++++- Orange/data/variable.py | 38 +++++++++++++++++++++++------- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py index 8975f508ca3..e15c7908e37 100644 --- a/Orange/data/tests/test_pandas.py +++ b/Orange/data/tests/test_pandas.py @@ -309,7 +309,6 @@ def test_table_from_frame_timezones(self): ] ) table = table_from_frame(df) - self.assertIsNone(table.domain.variables[0].utc_offset) self.assertEqual(table.domain.variables[0].timezone, timezone.utc) df = pd.DataFrame( diff --git a/Orange/data/tests/test_variable.py b/Orange/data/tests/test_variable.py index e26cfdeb5b9..0bf5ff157f7 100644 --- a/Orange/data/tests/test_variable.py +++ b/Orange/data/tests/test_variable.py @@ -15,7 +15,6 @@ import numpy as np import scipy.sparse as sp -import Orange from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \ StringVariable, TimeVariable, Unknown, Value, Table from Orange.data.io import CSVReader @@ -695,6 +694,15 @@ def varcls_modified(self, name): var.have_time = 1 return var + def test_remove_deprecated_utc_offset(self): + """ When this test start to fail: + - remove all marked locations in TimeVariable class + - uncomment new implementation for setting timezones in parse method + - remove this test + """ + import Orange # pylint: disable=import-outside-toplevel + self.assertLess(Orange.__version__, "3.32") + PickleContinuousVariable = create_pickling_tests( "PickleContinuousVariable", diff --git a/Orange/data/variable.py b/Orange/data/variable.py index 19a9b3f573a..63001923916 100644 --- a/Orange/data/variable.py +++ b/Orange/data/variable.py @@ -934,18 +934,19 @@ def __init__(self, date_string): # UTC offset and associated timezone. If parsed datetime values provide an # offset, it is used for display. If not all values have the same offset, # +0000 (=UTC) timezone is used and utc_offset is set to False. - _utc_offset = None - timezone = timezone.utc + _utc_offset = None # deprecated - remove in 3.32 + _timezone = None def __init__(self, *args, have_date=0, have_time=0, **kwargs): super().__init__(*args, **kwargs) self.have_date = have_date self.have_time = have_time + # deprecated - remove in 3.32 - from here @property def utc_offset(self): warnings.warn( - "utc_offset is deprecated and will be removed in Orange 3.31", + "utc_offset is deprecated and will be removed in Orange 3.32", OrangeDeprecationWarning ) return self._utc_offset @@ -953,10 +954,32 @@ def utc_offset(self): @utc_offset.setter def utc_offset(self, val): warnings.warn( - "utc_offset is deprecated and will be removed in Orange 3.31", + "utc_offset is deprecated and will be removed in Orange 3.32ß", OrangeDeprecationWarning ) self._utc_offset = val + # remove to here + + @property + def timezone(self): + if self._timezone is None or self._timezone == "different timezones": + return timezone.utc + else: + return self._timezone + + @timezone.setter + def timezone(self, tz): + """ + Set timezone value: + - if self._timezone is None set it to new timezone + - if current timezone is different that new indicate that TimeVariable + have two date-times with different timezones + - if timezones are same keep it + """ + if self._timezone is None: + self._timezone = tz + elif tz != self.timezone: + self._timezone = "different timezones" def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_): return super().copy(compute_value=compute_value, name=name, @@ -1043,16 +1066,15 @@ def parse(self, datestr): else: raise self.InvalidDateTimeFormatError(datestr) - # Remember UTC offset. If not all parsed values share the same offset, - # remember none of it. offset = dt.utcoffset() + self.timezone = timezone(offset) if offset is not None else None + # deprecated - remove in 3.32 - from here if self._utc_offset is not False: if offset and self._utc_offset is None: self._utc_offset = offset - self.timezone = timezone(offset) elif self._utc_offset != offset: self._utc_offset = False - self.timezone = timezone.utc + # remove to here # Convert time to UTC timezone. In dates without timezone, # localtime is assumed. See also: