From a7194f612adfb5ab9591d5f1bfb5bde4efe97eb7 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 14 Nov 2024 11:37:46 -0800
Subject: [PATCH] Fix reading of single-row unterminated CSV files (#17305)

Fixed the logic in the CSV reader that led to empty output when reading a file that contains a single row without a line terminator, instead of producing a table with a single column and one row.
Added tests to make sure the new logic does not cause regressions.
Also did some small cleanup around the fix.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17305
---
 cpp/src/io/csv/reader_impl.cu      | 42 +++++++++++++-----------------
 python/cudf/cudf/tests/test_csv.py | 17 ++++++++++++
 2 files changed, 35 insertions(+), 24 deletions(-)
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 72fca75c56b..6c84b53db46 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -118,47 +118,41 @@ string removeQuotes(string str, char quotechar)
 }
 
 /**
- * @brief Parse the first row to set the column names in the raw_csv parameter.
- * The first row can be either the header row, or the first data row
+ * @brief Parse a row of input to get the column names. The row can either be the header, or the
+ * first data row. If the header is not used, column names are generated automatically.
  */
-std::vector<std::string> get_column_names(std::vector<char> const& header,
+std::vector<std::string> get_column_names(std::vector<char> const& row,
                                           parse_options_view const& parse_opts,
                                           int header_row,
                                           std::string prefix)
 {
-  std::vector<std::string> col_names;
-
-  // If there is only a single character then it would be the terminator
-  if (header.size() <= 1) { return col_names; }
-
-  std::vector<char> first_row = header;
+  // Empty row, return empty column names vector
+  if (row.empty()) { return {}; }
 
+  std::vector<std::string> col_names;
   bool quotation = false;
-  for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) {
+  for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) {
     // Flip the quotation flag if current character is a quotechar
-    if (first_row[pos] == parse_opts.quotechar) {
-      quotation = !quotation;
-    }
+    if (row[pos] == parse_opts.quotechar) { quotation = !quotation; }
     // Check if end of a column/row
-    else if (pos == first_row.size() - 1 ||
-             (!quotation && first_row[pos] == parse_opts.terminator) ||
-             (!quotation && first_row[pos] == parse_opts.delimiter)) {
+    if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) ||
+        (!quotation && row[pos] == parse_opts.delimiter)) {
       // This is the header, add the column name
       if (header_row >= 0) {
         // Include the current character, in case the line is not terminated
         int col_name_len = pos - prev + 1;
         // Exclude the delimiter/terminator is present
-        if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) {
+        if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) {
           --col_name_len;
         }
         // Also exclude '\r' character at the end of the column name if it's
         // part of the terminator
-        if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' &&
-            first_row[pos - 1] == '\r') {
+        if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' &&
+            row[pos - 1] == '\r') {
           --col_name_len;
         }
 
-        string const new_col_name(first_row.data() + prev, col_name_len);
+        string const new_col_name(row.data() + prev, col_name_len);
         col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar));
       } else {
         // This is the first data row, add the automatically generated name
@@ -166,14 +160,14 @@ std::vector<std::string> get_column_names(std::vector<char> const& header,
       }
 
       // Stop parsing when we hit the line terminator; relevant when there is
-      // a blank line following the header. In this case, first_row includes
+      // a blank line following the header. In this case, row includes
       // multiple line terminators at the end, as the new recStart belongs to
       // a line that comes after the blank line(s)
-      if (!quotation && first_row[pos] == parse_opts.terminator) { break; }
+      if (!quotation && row[pos] == parse_opts.terminator) { break; }
 
       // Skip adjacent delimiters if delim_whitespace is set
-      while (parse_opts.multi_delimiter && pos < first_row.size() &&
-             first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) {
+      while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter &&
+             row[pos + 1] == parse_opts.delimiter) {
         ++pos;
       }
       prev = pos + 1;
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 8800275bf67..ac772c47e3a 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type():
         result = cudf.read_csv(StringIO(data), header=None).columns
     expected = pd.read_csv(StringIO(data), header=None).columns
     pd.testing.assert_index_equal(result, expected, exact=True)
+
+
+@pytest.mark.parametrize("buffer", ["1", '"one"'])
+def test_read_single_unterminated_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (1, 1))
+
+
+@pytest.mark.parametrize("buffer", ["\n", "\r\n"])
+def test_read_empty_only_row(buffer):
+    gdf = cudf.read_csv(StringIO(buffer), header=None)
+    assert_eq(gdf.shape, (0, 0))
+
+
+def test_read_empty_only_row_custom_terminator():
+    gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*")
+    assert_eq(gdf.shape, (0, 0))