Skip to content

Commit

Permalink
Merge pull request #5547 from PrimozGodec/pandas_compat-fix-time-var-…
Browse files Browse the repository at this point in the history
…coversion

[FIX] pandas_compat: fix conversion of datetime series
  • Loading branch information
lanzagar authored Aug 27, 2021
2 parents 47b4db4 + 672de2f commit 79ac616
Show file tree
Hide file tree
Showing 4 changed files with 349 additions and 21 deletions.
57 changes: 49 additions & 8 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,56 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
pd.to_datetime(s, infer_datetime_format=True)
# utc=True - to allow different timezones in a series object
pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False


def _convert_datetime(series, var):
def col_type(dt):
"""Test if is date, time or datetime"""
dt_nonnat = dt[~pd.isnull(dt)] # nat == nat is False
if (dt_nonnat.dt.floor("d") == dt_nonnat).all():
# all times are 00:00:00.0 - pure date
return 1, 0
elif (dt_nonnat.dt.date == pd.Timestamp("now").date()).all():
# all dates are today's date - pure time
return 0, 1 # pure time
else:
# else datetime
return 1, 1

try:
dt = pd.to_datetime(series)
except ValueError:
# series with type object and different timezones will raise a
# ValueError - normalizing to utc
dt = pd.to_datetime(series, utc=True)

# set variable type to date, time or datetime
var.have_date, var.have_time = col_type(dt)

if dt.dt.tz is not None:
# set timezone if available and convert to utc
var.timezone = dt.dt.tz
dt = dt.dt.tz_convert("UTC")

if var.have_time and not var.have_date:
# if time only measure seconds from midnight - equal to setting date
# to unix epoch
return (
(dt.dt.tz_localize(None) - pd.Timestamp("now").normalize())
/ pd.Timedelta("1s")
).values

return (
(dt.dt.tz_localize(None) - pd.Timestamp("1970-01-01")) / pd.Timedelta("1s")
).values


def vars_from_df(df, role=None, force_nominal=False):
if role is None and hasattr(df, 'orange_role'):
_role = df.orange_role
Expand Down Expand Up @@ -210,6 +253,11 @@ def vars_from_df(df, role=None, force_nominal=False):
Mcols.append(column)
Mexpr.append(None)
metas.append(var)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(_convert_datetime)
elif _is_discrete(s, force_nominal):
discrete = s.astype('category').cat
var = DiscreteVariable(str(column),
Expand All @@ -224,13 +272,6 @@ def to_cat(s, _):
return np.asarray(x)

Xexpr.append(to_cat)
elif _is_datetime(s):
var = TimeVariable(str(column))
attrs.append(var)
Xcols.append(column)
Xexpr.append(lambda s, v: np.asarray(
s.astype('str').replace('NaT', np.nan).map(v.parse)
))
elif is_numeric_dtype(s):
var = ContinuousVariable(
# set number of decimals to 0 if int else keeps default behaviour
Expand Down
243 changes: 242 additions & 1 deletion Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# pylint: disable=import-outside-toplevel

import unittest
from datetime import date, datetime, timezone

import numpy as np
import pytz
from scipy.sparse import csr_matrix
import scipy.sparse as sp

from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable, Table, Domain, \
StringVariable
from Orange.data.pandas_compat import OrangeDataFrame
from Orange.data.tests.test_variable import TestTimeVariable

try:
import pandas as pd
Expand Down Expand Up @@ -164,6 +167,244 @@ def test_not_orangedf(self):
for v1, v2 in zip(vars1, vars2):
self.assertEqual(type(v1), type(v2))

def test_table_from_frame_date(self):
    """Date-only columns convert to epoch seconds and are flagged as dates.

    The same two dates (plus a missing value) are supplied as pandas
    timestamps, as strings and as ``datetime.date`` objects; every
    representation must give the same X and have_date/have_time flags.
    """
    from Orange.data.pandas_compat import table_from_frame

    representations = (
        [pd.Timestamp("2017-12-19"), pd.Timestamp("1724-12-20"), np.nan],
        ["2017-12-19", "1724-12-20", np.nan],
        [date(2017, 12, 19), date(1724, 12, 20), np.nan],
    )
    for values in representations:
        frame = pd.DataFrame([[value] for value in values])
        table = table_from_frame(frame)
        expected = [
            [pd.Timestamp("2017-12-19").timestamp()],
            [pd.Timestamp("1724-12-20").timestamp()],
            [np.nan],
        ]
        np.testing.assert_equal(table.X, expected)
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 0)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_time(self):
    """Time-only columns convert to seconds on 1970-01-01, flagged as time.

    The values are supplied both as pandas timestamps and as plain
    strings; both must give the same X and have_date/have_time flags.
    """
    from Orange.data.pandas_compat import table_from_frame

    representations = (
        [pd.Timestamp("00:00:00.25"), pd.Timestamp("20:20:20.30"), np.nan],
        ["00:00:00.25", "20:20:20.30", np.nan],
    )
    for values in representations:
        frame = pd.DataFrame([[value] for value in values])
        table = table_from_frame(frame)
        expected = [
            [pd.Timestamp("1970-01-01 00:00:00.25").timestamp()],
            [pd.Timestamp("1970-01-01 20:20:20.30").timestamp()],
            [np.nan],
        ]
        np.testing.assert_equal(table.X, expected)
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 0)

def test_table_from_frame_datetime(self):
    """Full datetimes convert to epoch seconds with both flags set.

    The same two datetimes (plus a missing value) are supplied as pandas
    timestamps, as strings and as ``datetime.datetime`` objects; every
    representation must give the same X and have_date/have_time flags.
    """
    from Orange.data.pandas_compat import table_from_frame

    representations = (
        [
            pd.Timestamp("2017-12-19 00:00:00.50"),
            pd.Timestamp("1724-12-20 20:20:20.30"),
            np.nan,
        ],
        ["2017-12-19 00:00:00.50", "1724-12-20 20:20:20.30", np.nan],
        [
            datetime(2017, 12, 19, 0, 0, 0, 500000),
            datetime(1724, 12, 20, 20, 20, 20, 300000),
            np.nan,
        ],
    )
    for values in representations:
        frame = pd.DataFrame([[value] for value in values])
        table = table_from_frame(frame)
        expected = [
            [pd.Timestamp("2017-12-19 00:00:00.50").timestamp()],
            [pd.Timestamp("1724-12-20 20:20:20.30").timestamp()],
            [np.nan],
        ]
        np.testing.assert_equal(table.X, expected)
        time_var = table.domain.variables[0]
        self.assertEqual(time_var.have_time, 1)
        self.assertEqual(time_var.have_date, 1)

def test_table_from_frame_timezones(self):
    """Check the timezone stored on the variable and the converted values.

    Covers naive timestamps, explicit UTC, a fixed +1 offset, a named zone
    (CET) and a column mixing timezone-aware and naive timestamps.
    """
    from Orange.data.pandas_compat import table_from_frame

    def single_column(first, second):
        # one-column frame: two timestamps followed by a missing value
        return pd.DataFrame([[first], [second], [np.nan]])

    # naive timestamps: only the timezone of the variable is checked
    table = table_from_frame(
        single_column(
            pd.Timestamp("2017-12-19 00:00:00"),
            pd.Timestamp("1724-12-20 20:20:20"),
        )
    )
    self.assertEqual(table.domain.variables[0].timezone, timezone.utc)

    # (first value, second value, expected timezone,
    #  timestamp giving expected X[0], timestamp giving expected X[1])
    cases = (
        (
            pd.Timestamp("2017-12-19 00:00:00Z"),
            pd.Timestamp("1724-12-20 20:20:20Z"),
            pytz.utc,
            pd.Timestamp("2017-12-19 00:00:00"),
            pd.Timestamp("1724-12-20 20:20:20"),
        ),
        (
            pd.Timestamp("2017-12-19 00:00:00+1"),
            pd.Timestamp("1724-12-20 20:20:20+1"),
            pytz.FixedOffset(60),
            pd.Timestamp("2017-12-19 00:00:00+1"),
            pd.Timestamp("1724-12-20 20:20:20+1"),
        ),
        (
            pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
            pd.Timestamp("1724-12-20 20:20:20", tz="CET"),
            pytz.timezone("CET"),
            pd.Timestamp("2017-12-19 00:00:00+1"),
            pd.Timestamp("1724-12-20 20:20:20+1"),
        ),
        (
            # mixing aware and naive values is expected to end up in UTC
            pd.Timestamp("2017-12-19 00:00:00", tz="CET"),
            pd.Timestamp("1724-12-20 20:20:20"),
            pytz.utc,
            pd.Timestamp("2017-12-19 00:00:00+1"),
            pd.Timestamp("1724-12-20 20:20:20"),
        ),
    )
    for first, second, expected_tz, expected_first, expected_second in cases:
        table = table_from_frame(single_column(first, second))
        self.assertEqual(expected_tz, table.domain.variables[0].timezone)
        np.testing.assert_equal(
            table.X,
            [
                [expected_first.timestamp()],
                [expected_second.timestamp()],
                [np.nan],
            ],
        )

def test_time_variable_compatible(self):
    """Values converted through pandas must match TimeVariable parsing.

    For every ``(datestr, timestamp, outstr)`` entry in
    ``TestTimeVariable.TESTS`` the string is parsed with
    ``TimeVariable.to_val`` and via ``table_from_frame``; both results must
    agree with each other, with the reference timestamp and with the
    expected string representation.
    """
    from Orange.data.pandas_compat import table_from_frame

    def to_df(val):
        return pd.DataFrame([[pd.Timestamp(val)]])

    for datestr, timestamp, outstr in TestTimeVariable.TESTS:
        var = TimeVariable("time")
        var_parse = var.to_val(datestr)
        try:
            pandas_parse = table_from_frame(to_df(datestr)).X[0, 0]
        except ValueError:
            # pandas cannot parse some of the formats in the list; skip them
            continue
        if not (np.isnan(var_parse) and np.isnan(pandas_parse)):
            # nan == nan => False
            self.assertEqual(var_parse, pandas_parse)
            self.assertEqual(pandas_parse, timestamp)

        # compare the two parsing routes; comparing repr_val(var_parse)
        # with itself could never fail
        self.assertEqual(var.repr_val(var_parse), var.repr_val(pandas_parse))
        self.assertEqual(outstr, var.repr_val(var_parse))

@unittest.skip("Convert all Orange demo dataset. It takes about 5s which is way to slow")
def test_table_to_frame_on_all_orange_dataset(self):
from os import listdir
Expand Down
10 changes: 9 additions & 1 deletion Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import numpy as np
import scipy.sparse as sp

import Orange
from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \
StringVariable, TimeVariable, Unknown, Value, Table
from Orange.data.io import CSVReader
Expand Down Expand Up @@ -695,6 +694,15 @@ def varcls_modified(self, name):
var.have_time = 1
return var

def test_remove_deprecated_utc_offset(self):
    """Reminder test that fails once Orange reaches version 3.32.

    When this test starts to fail:
    - remove all marked locations in TimeVariable class
    - uncomment new implementation for setting timezones in parse method
    - remove this test
    """
    import Orange  # pylint: disable=import-outside-toplevel

    # Compare (major, minor) numerically: as strings "3.4" is not less than
    # "3.32" while "3.100" is, so a plain string comparison would misfire.
    version = tuple(int(part) for part in Orange.__version__.split(".")[:2])
    self.assertLess(version, (3, 32))


PickleContinuousVariable = create_pickling_tests(
"PickleContinuousVariable",
Expand Down
Loading

0 comments on commit 79ac616

Please sign in to comment.