Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON #14078

Merged
merged 6 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,11 +29,13 @@ public final class JSONOptions extends ColumnFilterOptions {

// Whether dates are parsed as DD/MM (true) or MM/DD (false).
private final boolean dayFirst;
// NOTE(review): presumably selects line-delimited JSON input (one record per line) - confirm against withLines.
private final boolean lines;
// When true, invalid lines yield null values instead of failing the read.
private final boolean recoverWithNull;

/**
 * Copies the option values collected by the builder into this instance's
 * final fields.
 *
 * @param builder the builder holding the requested option values
 */
private JSONOptions(Builder builder) {
  super(builder);
  this.dayFirst = builder.dayFirst;
  this.lines = builder.lines;
  this.recoverWithNull = builder.recoverWithNull;
}

public boolean isDayFirst() {
Expand All @@ -44,6 +46,11 @@ public boolean isLines() {
return lines;
}

/** Return the value of the recoverWithNull option */
public boolean isRecoverWithNull() {
razajafri marked this conversation as resolved.
Show resolved Hide resolved
return recoverWithNull;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -57,6 +64,8 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
// Default: dates are parsed as MM/DD.
private boolean dayFirst = false;
// Default: enabled. NOTE(review): presumably means line-delimited JSON - confirm against withLines.
private boolean lines = true;

// Default: an invalid line makes parsing fail with an exception.
private boolean recoverWithNull = false;

/**
* Whether to parse dates as DD/MM versus MM/DD
* @param dayFirst true: DD/MM, false, MM/DD
Expand All @@ -78,6 +87,20 @@ public Builder withLines(boolean perLine) {
return this;
}

/**
* Specify how to handle invalid lines when parsing json. Setting
* recoverWithNull to true will cause null values to be returned
* for invalid lines. Setting recoverWithNull to false will cause
* the parsing to fail with an exception.
*
* @param recoverWithNull true: return nulls, false: throw exception
* @return builder for chaining
*/
public Builder withRecoverWithNull(boolean recoverWithNull) {
razajafri marked this conversation as resolved.
Show resolved Hide resolved
this.recoverWithNull = recoverWithNull;
return this;
}

@Override
public Builder includeColumn(String... names) {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand Down
12 changes: 7 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ private static native long[] readCSV(String[] columnNames,
private static native long readJSON(String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
String filePath, long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines,
boolean recoverWithNulls) throws CudfException;

private static native long readAndInferJSON(long address, long length,
boolean dayFirst, boolean lines) throws CudfException;
boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1047,7 +1048,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
path.getAbsolutePath(),
0, 0,
opts.isDayFirst(), opts.isLines()))) {
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {

return gatherJSONColumns(schema, twm);
}
Expand Down Expand Up @@ -1099,7 +1100,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
opts.isDayFirst(), opts.isLines()));
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
}

/**
Expand All @@ -1121,7 +1122,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert offset >= 0 && offset < buffer.length;
try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
schema.getTypeIds(), schema.getTypeScales(), null,
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) {
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
opts.isRecoverWithNull()))) {
return gatherJSONColumns(schema, twm);
}
}
Expand Down
18 changes: 14 additions & 4 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1331,7 +1331,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
}

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1344,9 +1345,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
static_cast<std::size_t>(buffer_length)};

auto const recovery_mode = recover_with_null ?
cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

auto result =
std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
Expand Down Expand Up @@ -1404,7 +1409,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines) {
jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null) {

bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1448,9 +1454,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
static_cast<std::size_t>(buffer_length)} :
cudf::io::source_info{filename.get()};

cudf::io::json_recovery_mode_t recovery_mode =
recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
cudf::io::json_recovery_mode_t::FAIL;
cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
.dayfirst(static_cast<bool>(day_first))
.lines(static_cast<bool>(lines));
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode);

if (!n_col_names.is_null() && data_types.size() > 0) {
if (n_col_names.size() != n_types.size()) {
Expand Down
34 changes: 34 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ public class TableTest extends CudfTestBase {
private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");

private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
.column(DType.INT32, "A")
Expand Down Expand Up @@ -326,6 +327,39 @@ void testReadJSONFile() {
}
}

@Test
void testReadJSONFileWithInvalidLines() {
  final Schema nameAndAge = Schema.builder()
      .column(DType.STRING, "name")
      .column(DType.INT32, "age")
      .build();

  // recoverWithNull=true: the read succeeds and the expected table carries
  // nulls in both columns for the third row.
  final JSONOptions recoveringOpts = JSONOptions.builder()
      .withLines(true)
      .withRecoverWithNull(true)
      .build();
  try (Table expected = new Table.TestBuilder()
           .column("Michael", "Andy", null, "Justin")
           .column(null, 30, null, 19)
           .build();
       Table actual = Table.readJSON(nameAndAge, recoveringOpts, TEST_JSON_ERROR_FILE)) {
    assertTablesAreEqual(expected, actual);
  }

  // recoverWithNull=false: the same file must make the read throw.
  final JSONOptions failingOpts = JSONOptions.builder()
      .withLines(true)
      .withRecoverWithNull(false)
      .build();
  assertThrows(CudfException.class,
      () -> Table.readJSON(nameAndAge, failingOpts, TEST_JSON_ERROR_FILE));
}

@Test
void testReadJSONFileWithDifferentColumnOrder() {
Schema schema = Schema.builder()
Expand Down
4 changes: 4 additions & 0 deletions java/src/test/resources/people_with_invalid_lines.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"name":"Michael"}
{"name":"Andy", "age":30}
this_line_is_not_valid
{"name":"Justin", "age":19}