rapidsai · Matt711 · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024 · Oct 29, 2024
@@ -1362,7 +1362,7 @@ table_with_metadata read_csv(
  */
 
 /**
- *@brief Builder to build options for `writer_csv()`.
+ *@brief Builder to build options for `write_csv()`.
  */
 class csv_writer_options_builder;
 

@@ -1,10 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
 cimport pylibcudf.libcudf.types as libcudf_types
 
@@ -23,16 +19,8 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from pylibcudf.libcudf.io.csv cimport (
-    csv_writer_options,
-    write_csv as cpp_write_csv,
-)
-from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.types cimport sink_info
-from pylibcudf.libcudf.table.table_view cimport table_view
-
-from cudf._lib.io.utils cimport make_sink_info
-from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
+from cudf._lib.utils cimport data_from_pylibcudf_io
+from cudf._lib.utils import _dtype_to_names_list
 
 import pylibcudf as plc
 
@@ -318,59 +306,49 @@ def write_csv(
     --------
     cudf.to_csv
     """
-    cdef table_view input_table_view = table_view_from_table(
-        table, not index
-    )
-    cdef bool include_header_c = header
-    cdef char delim_c = ord(sep)
-    cdef string line_term_c = lineterminator.encode()
-    cdef string na_c = na_rep.encode()
-    cdef int rows_per_chunk_c = rows_per_chunk
-    cdef vector[string] col_names
-    cdef string true_value_c = 'True'.encode()
-    cdef string false_value_c = 'False'.encode()
-    cdef unique_ptr[data_sink] data_sink_c
-    cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
-
-    if header is True:
-        all_names = columns_apply_na_rep(table._column_names, na_rep)
-        if index is True:
-            all_names = table._index.names + all_names
-
-        if len(all_names) > 0:
-            col_names.reserve(len(all_names))
-            if len(all_names) == 1:
-                if all_names[0] in (None, ''):
-                    col_names.push_back('""'.encode())
-                else:
-                    col_names.push_back(
-                        str(all_names[0]).encode()
-                    )
-            else:
-                for idx, col_name in enumerate(all_names):
-                    if col_name is None:
-                        col_names.push_back(''.encode())
-                    else:
-                        col_names.push_back(
-                            str(col_name).encode()
-                        )
-
-    cdef csv_writer_options options = move(
-        csv_writer_options.builder(sink_info_c, input_table_view)
-        .names(col_names)
-        .na_rep(na_c)
-        .include_header(include_header_c)
-        .rows_per_chunk(rows_per_chunk_c)
-        .line_terminator(line_term_c)
-        .inter_column_delimiter(delim_c)
-        .true_value(true_value_c)
-        .false_value(false_value_c)
-        .build()
-    )
-
+    columns=[]
+    index_and_not_empty = index is True and table.index is not None
+    if index_and_not_empty:
+        columns.extend(col.to_pylibcudf(mode="read") for col in table.index._columns)
+    columns.extend(col.to_pylibcudf(mode="read") for col in table._columns)
+    if header:
+        all_names = []
+        if index_and_not_empty:
+            all_names.extend(table.index.names)
+        all_names.extend(
+            na_rep if name is None or pd.isnull(name)
+            else name for name in table._column_names
+        )
+        col_names = [
+            '""' if (name in (None, '') and len(all_names) == 1)
+            else (str(name) if name not in (None, '') else '')
+            for name in all_names
+        ]
+    else:
+        col_names = []
+    num_index_columns = len(table._index.names) if index_and_not_empty else 0
+    col_names_and_child_col_names = [
+        (
+            name,
+            _dtype_to_names_list(
+                table[table._column_names[i - num_index_columns]]._column
+            ) if i >= num_index_columns else []
+        )
+        for i, name in enumerate(col_names)
+    ]
     try:
-        with nogil:
-            cpp_write_csv(options)
+        plc.io.csv.write_csv(
+            plc.io.SinkInfo([path_or_buf]),
+            plc.io.TableWithMetadata(
+                plc.Table(columns),
+                col_names_and_child_col_names
+            ),
+            sep=str(sep),
+            na_rep=str(na_rep),
+            header=header,
+            lineterminator=str(lineterminator),
+            rows_per_chunk=rows_per_chunk,
+        )
     except OverflowError:
         raise OverflowError(
             f"Writing CSV file with chunksize={rows_per_chunk} failed. "
@@ -419,11 +397,3 @@ cdef DataType _get_plc_data_type_from_dtype(object dtype) except *:
 
     dtype = cudf.dtype(dtype)
     return dtype_to_pylibcudf_type(dtype)
-
-
-def columns_apply_na_rep(column_names, na_rep):
-    return tuple(
-        na_rep if pd.isnull(col_name)
-        else col_name
-        for col_name in column_names
-    )
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
@@ -16,6 +16,7 @@ from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport add_df_col_struct_names
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
+from cudf._lib.utils import _dtype_to_names_list
 
 import pylibcudf as plc
 
@@ -217,13 +218,3 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
             "supported in JSON reader"
         )
     return dtype_to_data_type(dtype)
-
-
-def _dtype_to_names_list(col):
-    if isinstance(col.dtype, cudf.StructDtype):
-        return [(name, _dtype_to_names_list(child))
-                for name, child in zip(col.dtype.fields, col.children)]
-    elif isinstance(col.dtype, cudf.ListDtype):
-        return [("", _dtype_to_names_list(child))
-                for child in col.children]
-    return []
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -398,3 +398,13 @@ cdef data_from_table_view(
         source_column_idx += 1
 
     return dict(zip(column_names, data_columns)), index
+
+
+def _dtype_to_names_list(col):
+    """
+    Build the nested list of child-column names for ``col``.
+
+    Parameters
+    ----------
+    col
+        Object exposing ``dtype`` and ``children``; presumably a cudf
+        column (callers pass ``<frame>[name]._column``) -- confirm.
+
+    Returns
+    -------
+    list
+        A list of ``(name, children)`` tuples, where ``children`` is the
+        result of calling this function recursively on each child.
+        Empty when the dtype is neither a struct nor a list dtype.
+    """
+    if isinstance(col.dtype, cudf.StructDtype):
+        # Struct: pair each field name of the dtype with the child at the
+        # same position and recurse into that child.
+        return [(name, _dtype_to_names_list(child))
+                for name, child in zip(col.dtype.fields, col.children)]
+    elif isinstance(col.dtype, cudf.ListDtype):
+        # List: children carry no names of their own, so every entry in
+        # ``col.children`` is emitted with an empty-string name.
+        return [("", _dtype_to_names_list(child))
+                for child in col.children]
+    # Any other dtype contributes no child names.
+    return []
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.vector cimport vector
+from libcpp.string cimport string
+from libcpp cimport bool
+from pylibcudf.libcudf.io.csv cimport (
+    csv_writer_options,
+    csv_writer_options_builder,
+)
+from pylibcudf.libcudf.io.types cimport quote_style
+from pylibcudf.io.types cimport SinkInfo
+from pylibcudf cimport Table
+
+# Extension type holding a libcudf ``csv_writer_options`` value.
+cdef class CsvWriterOptions:
+    # The wrapped C++ options object.
+    cdef csv_writer_options c_obj
+
+    # Entry point for constructing options: takes the destination sink and
+    # the table, and returns a CsvWriterOptionsBuilder.
+    @staticmethod
+    cdef CsvWriterOptionsBuilder builder(SinkInfo sink, Table table)
+
+
+# Extension type holding a libcudf ``csv_writer_options_builder`` value.
+# Every setter below is declared to return a CsvWriterOptionsBuilder and
+# ``build()`` returns a CsvWriterOptions; presumably the setters return
+# ``self`` so calls can be chained -- confirm against the .pyx.
+cdef class CsvWriterOptionsBuilder:
+    # The wrapped C++ builder object.
+    cdef csv_writer_options_builder c_obj
+    # Column names (a Python list).
+    cpdef CsvWriterOptionsBuilder names(self, list names)
+    # String used to represent null values.
+    cpdef CsvWriterOptionsBuilder na_rep(self, str val)
+    # Whether a header is included in the output.
+    cpdef CsvWriterOptionsBuilder include_header(self, bool val)
+    # Rows-per-chunk setting (declared as a C int).
+    cpdef CsvWriterOptionsBuilder rows_per_chunk(self, int val)
+    # Line terminator string.
+    cpdef CsvWriterOptionsBuilder line_terminator(self, str term)
+    # Delimiter placed between columns; passed as a Python str
+    # (presumably a single character -- confirm).
+    cpdef CsvWriterOptionsBuilder inter_column_delimiter(self, str delim)
+    # Text emitted for boolean true values.
+    cpdef CsvWriterOptionsBuilder true_value(self, str val)
+    # Text emitted for boolean false values.
+    cpdef CsvWriterOptionsBuilder false_value(self, str val)
+    # Produce the CsvWriterOptions object.
+    cpdef CsvWriterOptions build(self)
@@ -5,6 +5,7 @@ from collections.abc import Mapping
 from pylibcudf.io.types import (
     CompressionType,
     QuoteStyle,
+    SinkInfo,
     SourceInfo,
     TableWithMetadata,
 )
@@ -52,3 +53,13 @@ def read_csv(
     # detect_whitespace_around_quotes: bool = False,
     # timestamp_type: DataType = DataType(type_id.EMPTY),
 ) -> TableWithMetadata: ...
+# Type stub for the CSV writer entry point.
+#   sink_info      -- SinkInfo describing the output destination.
+#   table          -- TableWithMetadata to write.
+# Everything after ``*`` is keyword-only:
+#   sep            -- column separator (default ",").
+#   na_rep         -- string written for null values (default "").
+#   header         -- whether to write a header (default True).
+#   lineterminator -- line terminator (default "\n").
+#   rows_per_chunk -- rows-per-chunk setting (default 8 in this stub;
+#                     NOTE(review): confirm it matches the implementation).
+# Returns None.
+def write_csv(
+    sink_info: SinkInfo,
+    table: TableWithMetadata,
+    *,
+    sep: str = ",",
+    na_rep: str = "",
+    header: bool = True,
+    lineterminator: str = "\n",
+    rows_per_chunk: int = 8,
+) -> None: ...