From bfa2eae58805353d0f4489c50232ccbbe70617be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:47:39 -0700 Subject: [PATCH 1/4] Raise NotImplementedError in to_datetime if Z (or tz component) in string --- python/cudf/cudf/core/column/datetime.py | 11 ++++++----- python/cudf/cudf/tests/test_datetime.py | 6 ++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index da6c4fb858c..a08ffde4f8b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -651,11 +651,12 @@ def infer_format(element: str, **kwargs) -> str: raise ValueError("Unable to infer the timestamp format from the data") if len(second_parts) > 1: - # "Z" indicates Zulu time(widely used in aviation) - Which is - # UTC timezone that currently cudf only supports. Having any other - # unsupported timezone will let the code fail below - # with a ValueError. - second_parts.remove("Z") + # We may have a non-digit, timezone-like component + # like Z, UTC-3, +01:00 + if any(re.search(r"\D", part) for part in second_parts): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) second_part = "".join(second_parts[1:]) if len(second_part) > 1: diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4c20258ae67..af06511b82c 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2158,6 +2158,12 @@ def test_format_timezone_not_implemented(code): ) +@pytest.mark.parametrize("tz", ["Z", "UTC-3", "+01:00"]) +def test_no_format_timezone_not_implemented(tz): + with pytest.raises(NotImplementedError): + cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) + + @pytest.mark.parametrize("arg", [True, False]) def test_args_not_datetime_typerror(arg): with pytest.raises(TypeError): From ff15ed40240cb95e6473cced8f67ac9defe6c1ef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Sep 2023 17:19:22 -0700 Subject: [PATCH 2/4] Fix check and adjust test --- python/cudf/cudf/core/column/datetime.py | 4 ++++ python/cudf/cudf/tests/test_datetime.py | 8 ++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a08ffde4f8b..7775723e267 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -631,6 +631,10 @@ def infer_format(element: str, **kwargs) -> str: fmt = _guess_datetime_format(element, **kwargs) if fmt is not None: + if "%z" in fmt or "%Z" in fmt: + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) return fmt element_parts = element.split(".") diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index af06511b82c..dd544bc5187 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1278,12 +1278,8 @@ def test_datetime_reductions(data, op, dtype): @pytest.mark.parametrize("dtype", DATETIME_TYPES) def test_datetime_infer_format(data, dtype): sr = cudf.Series(data) - psr = pd.Series(data) - - expected = psr.astype(dtype) - actual = sr.astype(dtype) - - assert_eq(expected, actual) + with pytest.raises(NotImplementedError): + sr.astype(dtype) def test_dateoffset_instance_subclass_check(): From c9c9a03ef0ce1b37e4926f32b06edebc42b9b412 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Sep 2023 11:42:08 -0700 Subject: [PATCH 3/4] Update test_string_astype --- python/cudf/cudf/tests/test_string.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2bddd93ccb8..d54027eb707 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -200,12 +200,12 @@ def test_string_astype(dtype): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): data = [ - "2019-06-04T00:00:00Z", - "2019-06-04T12:12:12Z", - "2019-06-03T00:00:00Z", - "2019-05-04T00:00:00Z", - "2018-06-04T00:00:00Z", - "1922-07-21T01:02:03Z", + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", ] elif dtype == "str" or dtype == "object": data = ["ab", "cd", "ef", "gh", "ij"] From f4b0d0886a88773d4ac3f6f594411cfc9f657610 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Sep 2023 10:59:56 -0700 Subject: [PATCH 4/4] Reparameterize test --- python/cudf/cudf/tests/test_datetime.py | 43 +++++++++++-------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index dd544bc5187..5cab19eedc6 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1250,36 +1250,31 @@ def test_datetime_reductions(data, op, dtype): assert_eq(expected, actual) +@pytest.mark.parametrize("timezone", ["naive", "UTC"]) @pytest.mark.parametrize( "data", [ - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), - timezone="UTC", - ), - np.datetime_as_string( - np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), - timezone="UTC", - ), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[m]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[ns]"), + np.arange("2002-10-27T04:30", 10 * 60, 1, dtype="M8[us]"), + np.arange("2002-10-27T04:30", 4 * 60, 60, dtype="M8[s]"), ], ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, dtype): - sr = cudf.Series(data) - with pytest.raises(NotImplementedError): - sr.astype(dtype) +def test_datetime_infer_format(data, timezone, dtype): + ts_data = np.datetime_as_string(data, timezone=timezone) + sr = cudf.Series(ts_data) + if timezone == "naive": + psr = pd.Series(ts_data) + + expected = psr.astype(dtype) + actual = sr.astype(dtype) + + assert_eq(expected, actual) + else: + with pytest.raises(NotImplementedError): + sr.astype(dtype) def test_dateoffset_instance_subclass_check():