From a7194f612adfb5ab9591d5f1bfb5bde4efe97eb7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Nov 2024 11:37:46 -0800 Subject: [PATCH] Fix reading of single-row unterminated CSV files (#17305) Fixed the logic in the CSV reader that led to empty output instead of producing a table with a single column and one row. Added tests to make sure the new logic does not cause regressions. Also did some small cleanup around the fix. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17305 --- cpp/src/io/csv/reader_impl.cu | 42 +++++++++++++----------------- python/cudf/cudf/tests/test_csv.py | 17 ++++++++++++ 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 72fca75c56b..6c84b53db46 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -118,47 +118,41 @@ string removeQuotes(string str, char quotechar) } /** - * @brief Parse the first row to set the column names in the raw_csv parameter. - * The first row can be either the header row, or the first data row + * @brief Parse a row of input to get the column names. The row can either be the header, or the + * first data row. If the header is not used, column names are generated automatically. 
*/ -std::vector get_column_names(std::vector const& header, +std::vector get_column_names(std::vector const& row, parse_options_view const& parse_opts, int header_row, std::string prefix) { - std::vector col_names; - - // If there is only a single character then it would be the terminator - if (header.size() <= 1) { return col_names; } - - std::vector first_row = header; + // Empty row, return empty column names vector + if (row.empty()) { return {}; } + std::vector col_names; bool quotation = false; - for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { + for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == parse_opts.quotechar) { - quotation = !quotation; - } + if (row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || - (!quotation && first_row[pos] == parse_opts.terminator) || - (!quotation && first_row[pos] == parse_opts.delimiter)) { + if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) || + (!quotation && row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { + if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && - first_row[pos - 1] == '\r') { + if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' && + row[pos - 1] == '\r') { --col_name_len; } - string const new_col_name(first_row.data() + prev, col_name_len); + string const 
new_col_name(row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); } else { // This is the first data row, add the automatically generated name @@ -166,14 +160,14 @@ std::vector get_column_names(std::vector const& header, } // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes + // a blank line following the header. In this case, row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } + if (!quotation && row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set - while (parse_opts.multi_delimiter && pos < first_row.size() && - first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { + while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter && + row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8800275bf67..ac772c47e3a 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type(): result = cudf.read_csv(StringIO(data), header=None).columns expected = pd.read_csv(StringIO(data), header=None).columns pd.testing.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize("buffer", ["1", '"one"']) +def test_read_single_unterminated_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (1, 1)) + + +@pytest.mark.parametrize("buffer", ["\n", "\r\n"]) +def test_read_empty_only_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (0, 0)) + + +def test_read_empty_only_row_custom_terminator(): 
+ gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*") + assert_eq(gdf.shape, (0, 0))