Skip to content

Commit

Permalink
Fix reading of single-row unterminated CSV files (#17305)
Browse files Browse the repository at this point in the history
Fixed the logic in the CSV reader that led to empty output instead of producing a table with a single column and one row. 
Added tests to make sure the new logic does not cause regressions.
Also did some small cleanup around the fix.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: #17305
  • Loading branch information
vuule authored Nov 14, 2024
1 parent d93c3fc commit a7194f6
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 24 deletions.
42 changes: 18 additions & 24 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,62 +118,56 @@ string removeQuotes(string str, char quotechar)
}

/**
* @brief Parse the first row to set the column names in the raw_csv parameter.
* The first row can be either the header row, or the first data row
* @brief Parse a row of input to get the column names. The row can either be the header, or the
* first data row. If the header is not used, column names are generated automatically.
*/
std::vector<std::string> get_column_names(std::vector<char> const& header,
std::vector<std::string> get_column_names(std::vector<char> const& row,
parse_options_view const& parse_opts,
int header_row,
std::string prefix)
{
std::vector<std::string> col_names;

// If there is only a single character then it would be the terminator
if (header.size() <= 1) { return col_names; }

std::vector<char> first_row = header;
// Empty row, return empty column names vector
if (row.empty()) { return {}; }

std::vector<std::string> col_names;
bool quotation = false;
for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) {
for (size_t pos = 0, prev = 0; pos < row.size(); ++pos) {
// Flip the quotation flag if current character is a quotechar
if (first_row[pos] == parse_opts.quotechar) {
quotation = !quotation;
}
if (row[pos] == parse_opts.quotechar) { quotation = !quotation; }
// Check if end of a column/row
else if (pos == first_row.size() - 1 ||
(!quotation && first_row[pos] == parse_opts.terminator) ||
(!quotation && first_row[pos] == parse_opts.delimiter)) {
if (pos == row.size() - 1 || (!quotation && row[pos] == parse_opts.terminator) ||
(!quotation && row[pos] == parse_opts.delimiter)) {
// This is the header, add the column name
if (header_row >= 0) {
// Include the current character, in case the line is not terminated
int col_name_len = pos - prev + 1;
// Exclude the delimiter/terminator if present
if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) {
if (row[pos] == parse_opts.delimiter || row[pos] == parse_opts.terminator) {
--col_name_len;
}
// Also exclude '\r' character at the end of the column name if it's
// part of the terminator
if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' &&
first_row[pos - 1] == '\r') {
if (col_name_len > 0 && parse_opts.terminator == '\n' && row[pos] == '\n' &&
row[pos - 1] == '\r') {
--col_name_len;
}

string const new_col_name(first_row.data() + prev, col_name_len);
string const new_col_name(row.data() + prev, col_name_len);
col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar));
} else {
// This is the first data row, add the automatically generated name
col_names.push_back(prefix + std::to_string(col_names.size()));
}

// Stop parsing when we hit the line terminator; relevant when there is
// a blank line following the header. In this case, first_row includes
// a blank line following the header. In this case, row includes
// multiple line terminators at the end, as the new recStart belongs to
// a line that comes after the blank line(s)
if (!quotation && first_row[pos] == parse_opts.terminator) { break; }
if (!quotation && row[pos] == parse_opts.terminator) { break; }

// Skip adjacent delimiters if delim_whitespace is set
while (parse_opts.multi_delimiter && pos < first_row.size() &&
first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) {
while (parse_opts.multi_delimiter && pos < row.size() && row[pos] == parse_opts.delimiter &&
row[pos + 1] == parse_opts.delimiter) {
++pos;
}
prev = pos + 1;
Expand Down
17 changes: 17 additions & 0 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2277,3 +2277,20 @@ def test_read_header_none_pandas_compat_column_type():
result = cudf.read_csv(StringIO(data), header=None).columns
expected = pd.read_csv(StringIO(data), header=None).columns
pd.testing.assert_index_equal(result, expected, exact=True)


@pytest.mark.parametrize("buffer", ["1", '"one"'])
def test_read_single_unterminated_row(buffer):
    """Check that one unterminated row is read as a 1x1 table.

    ``buffer`` holds a single value (bare or quoted) with no trailing
    line terminator; with ``header=None`` the resulting frame is expected
    to have exactly one row and one column.
    """
    source = StringIO(buffer)
    frame = cudf.read_csv(source, header=None)
    assert_eq(frame.shape, (1, 1))


@pytest.mark.parametrize("buffer", ["\n", "\r\n"])
def test_read_empty_only_row(buffer):
    """Check that input made up of only a line terminator gives an empty table.

    ``buffer`` is either ``"\\n"`` or ``"\\r\\n"``; with ``header=None`` the
    resulting frame is expected to have zero rows and zero columns.
    """
    source = StringIO(buffer)
    frame = cudf.read_csv(source, header=None)
    assert_eq(frame.shape, (0, 0))


def test_read_empty_only_row_custom_terminator():
    """Check that a lone custom line terminator gives an empty table.

    The input is the single character ``"*"``, which is also passed as
    ``lineterminator``; the resulting frame is expected to have zero rows
    and zero columns.
    """
    source = StringIO("*")
    frame = cudf.read_csv(source, header=None, lineterminator="*")
    assert_eq(frame.shape, (0, 0))

0 comments on commit a7194f6

Please sign in to comment.