From b2c7c75edd9e9ad5b7e1cc5b370402bb47a951dc Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Fri, 6 Aug 2021 14:09:57 +0200
Subject: [PATCH 1/2] pandas_compat: fix conversion of datetime
---
Orange/data/pandas_compat.py | 57 +++++++-
Orange/data/tests/test_pandas.py | 244 ++++++++++++++++++++++++++++++-
Orange/data/variable.py | 28 +++-
3 files changed, 314 insertions(+), 15 deletions(-)
diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py
index e05bf9d8fba..7321b6d7f72 100644
--- a/Orange/data/pandas_compat.py
+++ b/Orange/data/pandas_compat.py
@@ -157,13 +157,56 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
- pd.to_datetime(s, infer_datetime_format=True)
+ # utc=True - to allow different timezones in a series object
+ pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False
+def _convert_datetime(series, var):
+    """Return `series` as seconds since the Unix epoch; NaT becomes nan.
+
+    Sets var.have_date / var.have_time, and var.timezone for tz-aware data.
+    """
+    try:
+        dt = pd.to_datetime(series)
+    except ValueError:
+        # series with type object and different timezones will raise a
+        # ValueError - normalizing to utc
+        dt = pd.to_datetime(series, utc=True)
+
+    # Evaluate "today" once: it both detects time-only columns and serves as
+    # their reference midnight, so the two uses must never disagree.
+    today = pd.Timestamp("now").normalize()
+
+    # set variable type to date, time or datetime
+    dt_nonnat = dt[~pd.isnull(dt)]  # nat == nat is False
+    if (dt_nonnat.dt.floor("d") == dt_nonnat).all():
+        # all times are 00:00:00.0 - pure date
+        var.have_date, var.have_time = 1, 0
+    elif (dt_nonnat.dt.date == today.date()).all():
+        # all dates are today's date - pure time
+        var.have_date, var.have_time = 0, 1
+    else:
+        var.have_date, var.have_time = 1, 1
+
+    if dt.dt.tz is not None:
+        # set timezone if available and convert to utc
+        var.timezone = dt.dt.tz
+        dt = dt.dt.tz_convert("UTC")
+
+    # time-only values are measured from today's midnight, which is
+    # equivalent to placing them on the unix epoch date
+    if var.have_time and not var.have_date:
+        origin = today
+    else:
+        origin = pd.Timestamp("1970-01-01")
+
+    return ((dt.dt.tz_localize(None) - origin) / pd.Timedelta("1s")).values
+
+
def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
@@ -210,6 +253,11 @@ def vars_from_df(df, role=None, force_nominal=False):
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
+ elif _is_datetime(s):
+ var = TimeVariable(str(column))
+ attrs.append(var)
+ Xcols.append(column)
+ Xexpr.append(_convert_datetime)
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
@@ -224,13 +272,6 @@ def to_cat(s, _):
return np.asarray(x)
Xexpr.append(to_cat)
- elif _is_datetime(s):
- var = TimeVariable(str(column))
- attrs.append(var)
- Xcols.append(column)
- Xexpr.append(lambda s, v: np.asarray(
- s.astype('str').replace('NaT', np.nan).map(v.parse)
- ))
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py
index 1ffc4620794..8975f508ca3 100644
--- a/Orange/data/tests/test_pandas.py
+++ b/Orange/data/tests/test_pandas.py
@@ -1,13 +1,16 @@
# pylint: disable=import-outside-toplevel
-
import unittest
+from datetime import date, datetime, timezone
+
import numpy as np
+import pytz
from scipy.sparse import csr_matrix
import scipy.sparse as sp
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table, Domain, \
StringVariable
from Orange.data.pandas_compat import OrangeDataFrame
+from Orange.data.tests.test_variable import TestTimeVariable
try:
import pandas as pd
@@ -164,6 +167,245 @@ def test_not_orangedf(self):
for v1, v2 in zip(vars1, vars2):
self.assertEqual(type(v1), type(v2))
+ def test_table_from_frame_date(self):
+ """Midnight-only Timestamps, date strings and date objects give a date-only variable."""
+ from Orange.data.pandas_compat import table_from_frame
+ df = pd.DataFrame(
+ [[pd.Timestamp("2017-12-19")], [pd.Timestamp("1724-12-20")], [np.nan]]
+ )
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19").timestamp()],
+ [pd.Timestamp("1724-12-20").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 0)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ df = pd.DataFrame([["2017-12-19"], ["1724-12-20"], [np.nan]])  # case: date strings
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19").timestamp()],
+ [pd.Timestamp("1724-12-20").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 0)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ df = pd.DataFrame([[date(2017, 12, 19)], [date(1724, 12, 20)], [np.nan]])  # case: datetime.date
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19").timestamp()],
+ [pd.Timestamp("1724-12-20").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 0)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ def test_table_from_frame_time(self):
+ """Bare times give a time-only variable whose values sit on the 1970-01-01 date."""
+ from Orange.data.pandas_compat import table_from_frame
+ df = pd.DataFrame(
+ [[pd.Timestamp("00:00:00.25")], [pd.Timestamp("20:20:20.30")], [np.nan]]
+ )
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
+ [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 1)
+ self.assertEqual(table.domain.variables[0].have_date, 0)
+
+ df = pd.DataFrame([["00:00:00.25"], ["20:20:20.30"], [np.nan]])  # case: time strings
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
+ [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 1)
+ self.assertEqual(table.domain.variables[0].have_date, 0)
+
+ def test_table_from_frame_datetime(self):
+ """Values with both a date and a time of day set have_date and have_time."""
+ from Orange.data.pandas_compat import table_from_frame
+ df = pd.DataFrame(
+ [
+ [pd.Timestamp("2017-12-19 00:00:00.50")],
+ [pd.Timestamp("1724-12-20 20:20:20.30")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 1)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ df = pd.DataFrame(  # case: datetime strings
+ [["2017-12-19 00:00:00.50"], ["1724-12-20 20:20:20.30"], [np.nan]]
+ )
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 1)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ df = pd.DataFrame(  # case: datetime.datetime objects
+ [
+ [datetime(2017, 12, 19, 0, 0, 0, 500000)],
+ [datetime(1724, 12, 20, 20, 20, 20, 300000)],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
+ [np.nan],
+ ],
+ )
+ self.assertEqual(table.domain.variables[0].have_time, 1)
+ self.assertEqual(table.domain.variables[0].have_date, 1)
+
+ def test_table_from_frame_timezones(self):
+ """The column's timezone is stored on the variable; mixed input reports UTC."""
+ from Orange.data.pandas_compat import table_from_frame
+ df = pd.DataFrame(  # case: naive values
+ [
+ [pd.Timestamp("2017-12-19 00:00:00")],
+ [pd.Timestamp("1724-12-20 20:20:20")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ self.assertIsNone(table.domain.variables[0].utc_offset)
+ self.assertEqual(table.domain.variables[0].timezone, timezone.utc)
+
+ df = pd.DataFrame(  # case: explicit UTC ("Z" suffix)
+ [
+ [pd.Timestamp("2017-12-19 00:00:00Z")],
+ [pd.Timestamp("1724-12-20 20:20:20Z")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ self.assertEqual(pytz.utc, table.domain.variables[0].timezone)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20").timestamp()],
+ [np.nan],
+ ],
+ )
+
+ df = pd.DataFrame(  # case: fixed +1 hour offset
+ [
+ [pd.Timestamp("2017-12-19 00:00:00+1")],
+ [pd.Timestamp("1724-12-20 20:20:20+1")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ self.assertEqual(pytz.FixedOffset(60), table.domain.variables[0].timezone)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20+1").timestamp()],
+ [np.nan],
+ ],
+ )
+
+ df = pd.DataFrame(  # case: named zone
+ [
+ [pd.Timestamp("2017-12-19 00:00:00", tz="CET")],
+ [pd.Timestamp("1724-12-20 20:20:20", tz="CET")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ self.assertEqual(pytz.timezone("CET"), table.domain.variables[0].timezone)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20+1").timestamp()],
+ [np.nan],
+ ],
+ )
+
+ df = pd.DataFrame(  # case: tz-aware and naive values mixed in one column
+ [
+ [pd.Timestamp("2017-12-19 00:00:00", tz="CET")],
+ [pd.Timestamp("1724-12-20 20:20:20")],
+ [np.nan],
+ ]
+ )
+ table = table_from_frame(df)
+ self.assertEqual(pytz.utc, table.domain.variables[0].timezone)
+ np.testing.assert_equal(
+ table.X,
+ [
+ [pd.Timestamp("2017-12-19 00:00:00+1").timestamp()],
+ [pd.Timestamp("1724-12-20 20:20:20").timestamp()],
+ [np.nan],
+ ],
+ )
+
+    def test_time_variable_compatible(self):
+        """Values parsed through pandas must match TimeVariable's own parsing."""
+        from Orange.data.pandas_compat import table_from_frame
+
+        def to_df(val):
+            return pd.DataFrame([[pd.Timestamp(val)]])
+
+        for datestr, timestamp, outstr in TestTimeVariable.TESTS:
+            var = TimeVariable("time")
+            var_parse = var.to_val(datestr)
+            try:
+                pandas_parse = table_from_frame(to_df(datestr)).X[0, 0]
+            except ValueError:
+                # pandas cannot parse some formats in the list; skip them
+                continue
+            if not (np.isnan(var_parse) and np.isnan(pandas_parse)):
+                # nan == nan => False
+                self.assertEqual(var_parse, pandas_parse)
+                self.assertEqual(pandas_parse, timestamp)
+            self.assertEqual(var.repr_val(var_parse), var.repr_val(pandas_parse))
+            self.assertEqual(outstr, var.repr_val(var_parse))
+
@unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow")
def test_table_to_frame_on_all_orange_dataset(self):
from os import listdir
diff --git a/Orange/data/variable.py b/Orange/data/variable.py
index b538ef082fc..19a9b3f573a 100644
--- a/Orange/data/variable.py
+++ b/Orange/data/variable.py
@@ -934,7 +934,7 @@ def __init__(self, date_string):
# UTC offset and associated timezone. If parsed datetime values provide an
# offset, it is used for display. If not all values have the same offset,
# +0000 (=UTC) timezone is used and utc_offset is set to False.
- utc_offset = None
+ _utc_offset = None
timezone = timezone.utc
def __init__(self, *args, have_date=0, have_time=0, **kwargs):
@@ -942,6 +942,22 @@ def __init__(self, *args, have_date=0, have_time=0, **kwargs):
self.have_date = have_date
self.have_time = have_time
+ @property
+ def utc_offset(self):  # deprecated read access to _utc_offset; warns on every call
+ warnings.warn(
+ "utc_offset is deprecated and will be removed in Orange 3.31",
+ OrangeDeprecationWarning
+ )
+ return self._utc_offset  # None until parse() sees an offset; False once offsets differ
+
+ @utc_offset.setter
+ def utc_offset(self, val):  # deprecated write access to _utc_offset; warns on every call
+ warnings.warn(
+ "utc_offset is deprecated and will be removed in Orange 3.31",
+ OrangeDeprecationWarning
+ )
+ self._utc_offset = val  # stored as given, without validation
+
def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_):
return super().copy(compute_value=compute_value, name=name,
have_date=self.have_date, have_time=self.have_time)
@@ -1030,12 +1046,12 @@ def parse(self, datestr):
# Remember UTC offset. If not all parsed values share the same offset,
# remember none of it.
offset = dt.utcoffset()
- if self.utc_offset is not False:
- if offset and self.utc_offset is None:
- self.utc_offset = offset
+ if self._utc_offset is not False:
+ if offset and self._utc_offset is None:
+ self._utc_offset = offset
self.timezone = timezone(offset)
- elif self.utc_offset != offset:
- self.utc_offset = False
+ elif self._utc_offset != offset:
+ self._utc_offset = False
self.timezone = timezone.utc
# Convert time to UTC timezone. In dates without timezone,
From 672de2ff35bade6c5b3ef2796957f42d0a359e86 Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Wed, 11 Aug 2021 09:21:01 +0200
Subject: [PATCH 2/2] TimeVariable: handle different timezones
---
Orange/data/tests/test_pandas.py | 1 -
Orange/data/tests/test_variable.py | 10 +++++++-
Orange/data/variable.py | 38 +++++++++++++++++++++++-------
3 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py
index 8975f508ca3..e15c7908e37 100644
--- a/Orange/data/tests/test_pandas.py
+++ b/Orange/data/tests/test_pandas.py
@@ -309,7 +309,6 @@ def test_table_from_frame_timezones(self):
]
)
table = table_from_frame(df)
- self.assertIsNone(table.domain.variables[0].utc_offset)
self.assertEqual(table.domain.variables[0].timezone, timezone.utc)
df = pd.DataFrame(
diff --git a/Orange/data/tests/test_variable.py b/Orange/data/tests/test_variable.py
index e26cfdeb5b9..0bf5ff157f7 100644
--- a/Orange/data/tests/test_variable.py
+++ b/Orange/data/tests/test_variable.py
@@ -15,7 +15,6 @@
import numpy as np
import scipy.sparse as sp
-import Orange
from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \
StringVariable, TimeVariable, Unknown, Value, Table
from Orange.data.io import CSVReader
@@ -695,6 +694,15 @@ def varcls_modified(self, name):
var.have_time = 1
return var
+ def test_remove_deprecated_utc_offset(self):
+ """ When this test starts to fail:
+ - remove all marked locations in the TimeVariable class
+ - uncomment the new implementation for setting timezones in the parse method
+ - remove this test
+ """
+ import Orange # pylint: disable=import-outside-toplevel
+ self.assertLess(Orange.__version__, "3.32")  # NOTE(review): plain string comparison, not version-aware
+
PickleContinuousVariable = create_pickling_tests(
"PickleContinuousVariable",
diff --git a/Orange/data/variable.py b/Orange/data/variable.py
index 19a9b3f573a..63001923916 100644
--- a/Orange/data/variable.py
+++ b/Orange/data/variable.py
@@ -934,18 +934,19 @@ def __init__(self, date_string):
# UTC offset and associated timezone. If parsed datetime values provide an
# offset, it is used for display. If not all values have the same offset,
# +0000 (=UTC) timezone is used and utc_offset is set to False.
- _utc_offset = None
- timezone = timezone.utc
+ _utc_offset = None # deprecated - remove in 3.32
+ _timezone = None
def __init__(self, *args, have_date=0, have_time=0, **kwargs):
super().__init__(*args, **kwargs)
self.have_date = have_date
self.have_time = have_time
+ # deprecated - remove in 3.32 - from here
@property
def utc_offset(self):
warnings.warn(
- "utc_offset is deprecated and will be removed in Orange 3.31",
+ "utc_offset is deprecated and will be removed in Orange 3.32",
OrangeDeprecationWarning
)
return self._utc_offset
@@ -953,10 +954,32 @@ def utc_offset(self):
@utc_offset.setter
def utc_offset(self, val):
warnings.warn(
- "utc_offset is deprecated and will be removed in Orange 3.31",
+ "utc_offset is deprecated and will be removed in Orange 3.32",
OrangeDeprecationWarning
)
self._utc_offset = val
+ # remove to here
+
+    @property
+    def timezone(self):
+        """Timezone for display; UTC if none was set or if values disagreed."""
+        if self._timezone is not None and self._timezone != "different timezones":
+            return self._timezone
+        return timezone.utc
+
+ @timezone.setter
+ def timezone(self, tz):
+ """
+ Set timezone value:
+ - if self._timezone is None, set it to the new timezone
+ - if the current timezone differs from the new one, record that the
+ TimeVariable holds date-times with different timezones
+ - if the timezones are the same, keep the current one
+ """
+ if self._timezone is None:
+ self._timezone = tz
+ elif tz != self.timezone:  # compares with the public property, which is UTC once marked as mixed
+ self._timezone = "different timezones"
def copy(self, compute_value=Variable._CopyComputeValue, *, name=None, **_):
return super().copy(compute_value=compute_value, name=name,
@@ -1043,16 +1066,15 @@ def parse(self, datestr):
else:
raise self.InvalidDateTimeFormatError(datestr)
- # Remember UTC offset. If not all parsed values share the same offset,
- # remember none of it.
offset = dt.utcoffset()
+ self.timezone = timezone(offset) if offset is not None else None
+ # deprecated - remove in 3.32 - from here
if self._utc_offset is not False:
if offset and self._utc_offset is None:
self._utc_offset = offset
- self.timezone = timezone(offset)
elif self._utc_offset != offset:
self._utc_offset = False
- self.timezone = timezone.utc
+ # remove to here
# Convert time to UTC timezone. In dates without timezone,
# localtime is assumed. See also: