From 833bfda4a457fe3669ef1b3b0d493a14d34eb747 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 13 Sep 2023 12:58:54 -0700 Subject: [PATCH 1/4] Fix name of the column object --- python/cudf/cudf/core/column_accessor.py | 2 -- python/cudf/cudf/core/dataframe.py | 24 ++++++++++++++++++++++-- python/cudf/cudf/core/indexed_frame.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 7 +++++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bec9c367ba9..cb79a30422e 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -197,8 +197,6 @@ def nlevels(self) -> int: @property def name(self) -> Any: - if len(self._data) == 0: - return None return self.level_names[-1] @property diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5a3d25a08a7..572837979cc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -665,7 +665,10 @@ def __init__( len(self), dtype="object", masked=True ) for k in columns - } + }, + level_names=tuple(columns.names) + if isinstance(columns, pd.Index) + else None, ) elif isinstance(data, ColumnAccessor): raise TypeError( @@ -712,6 +715,11 @@ def __init__( self._data = new_df._data self._index = new_df._index + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -834,6 +842,11 @@ def _init_from_series_list(self, data, columns, index): self._data[col_name] = column.column_empty( row_count=len(self), dtype=None, masked=True ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) self._data = self._data.select_by_label(columns) @_cudf_nvtx_annotate @@ -956,6 +969,11 @@ def _init_from_dict_like( data[col_name], nan_as_null=nan_as_null, ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) @classmethod def _from_data( @@ -5125,7 +5143,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) - df._data._level_names = list(dataframe.columns.names) + df._data._level_names = tuple(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5371,6 +5389,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): df = df.set_index(index) else: df._index = as_index(index) + if isinstance(columns, pd.Index): + df._data._level_names = list(columns.names) return df @classmethod diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 62e091b29b5..1796baa6147 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2661,7 +2661,9 @@ def _reindex( data=cudf.core.column_accessor.ColumnAccessor( cols, multiindex=self._data.multiindex, - level_names=self._data.level_names, + level_names=tuple(column_names.names) + if isinstance(column_names, pd.Index) + else None, ), index=index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 61372bab3ad..53a221e66a7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10349,3 +10349,10 @@ def test_dataframe_round_builtin(digits): actual = round(gdf, digits) assert_eq(expected, actual) + + +def test_dataframe_empty_columns(): + pdf = pd.DataFrame(columns=pd.Index(["a", "b", "c"], name="NAME")) + gdf = cudf.DataFrame(columns=pd.Index(["a", "b", "c"], name="NAME")) + + assert_eq(pdf, gdf, check_index_type=False) From bf33ebb9d886e9981c57ca51f6f5c387ce9e8bbb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 13 Sep 2023 13:31:26 -0700 Subject: [PATCH 2/4] add tests --- python/cudf/cudf/tests/test_dataframe.py | 34 +++++++++++++++++------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 53a221e66a7..02a2dc80dbd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6362,6 +6362,7 @@ def test_df_series_dataframe_astype_dtype_dict(copy): ([range(100), range(100)], ["range" + str(i) for i in range(100)]), (((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]), ([[1, 2, 3]], ["list col1", "list col2", "list col3"]), + ([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")), ([range(100)], ["range" + str(i) for i in range(100)]), (((1, 2, 3),), ["k1", "k2", "k3"]), ], @@ -7937,6 +7938,7 @@ def test_series_empty(ps): @pytest.mark.parametrize( "data", [ + None, [], [1], {"a": [10, 11, 12]}, @@ -7947,7 +7949,10 @@ def test_series_empty(ps): }, ], ) -@pytest.mark.parametrize("columns", [["a"], ["another column name"], None]) +@pytest.mark.parametrize( + "columns", + [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], +) def test_dataframe_init_with_columns(data, columns): pdf = pd.DataFrame(data, columns=columns) gdf = cudf.DataFrame(data, columns=columns) @@ -8015,7 +8020,16 @@ def test_dataframe_init_with_columns(data, columns): ], ) @pytest.mark.parametrize( - "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]] + "columns", + [ + None, + ["0"], + [0], + ["abc"], + [144, 13], + [2, 1, 0], + pd.Index(["abc"], name="custom_name"), + ], ) def test_dataframe_init_from_series_list(data, ignore_dtype, columns): gd_data = [cudf.from_pandas(obj) for obj in data] @@ -10207,7 +10221,14 @@ def test_dataframe_binop_with_datetime_index(): @pytest.mark.parametrize( - "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"]) + "columns", + ( + [], + ["c", "a"], + ["a", "d", "b", "e", "c"], + ["a", "b", "c"], + pd.Index(["b", "a"], name="custom_name"), + ), ) @pytest.mark.parametrize("index", (None, [4, 5, 6])) def test_dataframe_dict_like_with_columns(columns, index): @@ -10349,10 +10370,3 @@ def test_dataframe_round_builtin(digits): actual = round(gdf, digits) assert_eq(expected, actual) - - -def test_dataframe_empty_columns(): - pdf = pd.DataFrame(columns=pd.Index(["a", "b", "c"], name="NAME")) - gdf = cudf.DataFrame(columns=pd.Index(["a", "b", "c"], name="NAME")) - - assert_eq(pdf, gdf, check_index_type=False) From eb53a5ac94c9c84df3dc928823840ec6378e386b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 13 Sep 2023 13:52:21 -0700 Subject: [PATCH 3/4] fix pytest --- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 02a2dc80dbd..f7b10527de8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10227,7 +10227,7 @@ def test_dataframe_binop_with_datetime_index(): ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"], - pd.Index(["b", "a"], name="custom_name"), + pd.Index(["b", "a", "c"], name="custom_name"), ), ) @pytest.mark.parametrize("index", (None, [4, 5, 6])) @@ -10235,7 +10235,7 @@ def test_dataframe_dict_like_with_columns(columns, index): data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} expect = pd.DataFrame(data, columns=columns, index=index) actual = cudf.DataFrame(data, columns=columns, index=index) - if index is None and columns == []: + if index is None and len(columns) == 0: # We make an empty range index, pandas makes an empty index expect = expect.reset_index(drop=True) assert_eq(expect, actual) From 0b7d78b824ea42438145be734633132d8be927fb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 13 Sep 2023 13:53:37 -0700 Subject: [PATCH 4/4] cleanup --- python/cudf/cudf/core/dataframe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 572837979cc..e5ba8bf0b63 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5390,7 +5390,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): else: df._index = as_index(index) if isinstance(columns, pd.Index): - df._data._level_names = list(columns.names) + df._data._level_names = tuple(columns.names) return df @classmethod @@ -5448,7 +5448,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): data, nan_as_null=nan_as_null ) if isinstance(columns, pd.Index): - df._data._level_names = list(columns.names) + df._data._level_names = tuple(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data))