Skip to content

Commit

Permalink
Improving cast for date columns
Browse files Browse the repository at this point in the history
  • Loading branch information
igorborgest committed Nov 23, 2019
1 parent 8e5853b commit 3e26250
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 4 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

> Utility belt to handle data on AWS.
[![Release](https://img.shields.io/badge/release-0.0.22-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Release](https://img.shields.io/badge/release-0.0.23-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
Expand Down
2 changes: 1 addition & 1 deletion awswrangler/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__title__ = "awswrangler"
__description__ = "Utility belt to handle data on AWS."
__version__ = "0.0.22"
__version__ = "0.0.23"
__license__ = "Apache License 2.0"
4 changes: 2 additions & 2 deletions awswrangler/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def _apply_dates_to_generator(generator, parse_dates):
for df in generator:
if len(df.index) > 0:
for col in parse_dates:
df[col] = df[col].dt.date
df[col] = df[col].dt.date.replace(to_replace={pd.NaT: None})
yield df

def to_csv(
Expand Down Expand Up @@ -788,7 +788,7 @@ def _cast_pandas(dataframe: pd.DataFrame, cast_columns: Dict[str, str]) -> pd.Da
if pandas_type == "datetime64":
dataframe[col] = pd.to_datetime(dataframe[col])
elif pandas_type == "date":
dataframe[col] = pd.to_datetime(dataframe[col]).dt.date
dataframe[col] = pd.to_datetime(dataframe[col]).dt.date.replace(to_replace={pd.NaT: None})
else:
dataframe[col] = dataframe[col].astype(pandas_type, skipna=True)
return dataframe
Expand Down
31 changes: 31 additions & 0 deletions testing/test_awswrangler/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1249,3 +1249,34 @@ def test_to_parquet_date_null(session, bucket, database):

assert df[df.col1 == "val2"].iloc[0].datecol == df2[df2.col1 == "val2"].iloc[0].datecol
assert df2[df2.col1 == "val2"].iloc[0].datecol == df3[df3.col1 == "val2"].iloc[0].datecol is None


def test_to_parquet_date_null_at_first(session, bucket, database):
    """Round-trip a "date"-cast column whose leading values are all null.

    Writes a DataFrame where the first nine rows of ``datecol`` alternate
    between None and pd.NaT and only the final row carries a real date,
    casting the column to "date" on write. Reads the table back through
    Athena and verifies both the real date and a leading null survive.
    """
    null_heavy_dates = [None, pd.NaT] * 4 + [None, date(2019, 11, 9)]
    df = pd.DataFrame({
        "col1": [f"val{i}" for i in range(10)],
        "datecol": null_heavy_dates,
    })
    path = f"s3://{bucket}/test/"
    session.pandas.to_parquet(dataframe=df,
                              database=database,
                              table="test",
                              path=path,
                              mode="overwrite",
                              preserve_index=False,
                              procs_cpu_bound=1,
                              cast_columns={"datecol": "date"})

    # S3/Glue are eventually consistent: poll until the row counts match.
    df2 = None
    for _ in range(10):
        sleep(1)
        df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
        if len(df2.index) == len(df.index):
            break

    session.s3.delete_objects(path=path)

    assert len(list(df.columns)) == len(list(df2.columns))
    assert len(df.index) == len(df2.index)

    # The lone real date must survive the round trip, and a null that
    # appears before any real value must come back as None (not NaT).
    assert df[df.col1 == "val9"].iloc[0].datecol == df2[df2.col1 == "val9"].iloc[0].datecol == date(2019, 11, 9)
    assert df[df.col1 == "val0"].iloc[0].datecol == df2[df2.col1 == "val0"].iloc[0].datecol is None

0 comments on commit 3e26250

Please sign in to comment.