Skip to content

Commit

Permalink
refactor: Make [non] nullable struct fields easier to create (#646)
Browse files Browse the repository at this point in the history
## What changes are proposed in this pull request?

A lot of code (especially in tests) calls `StructField::new` with
literal values of the `nullable: bool` argument. Booleans are easy to
misinterpret (which value means non-null field?) -- and it's hard to
read when the third arg is split to its own line in nested expressions
such as:
```rust
        StructField::new(
            "fileConstantValues",
            StructType::new([StructField::new(
                "partitionValues",
                MapType::new(DataType::STRING, DataType::STRING, true),
                false,
            )]),
            true,
        ),
```

To improve readability and make the code less error-prone, define two
new helper methods/constructors for `StructField`: `nullable` and
`not_null`, which create struct fields having the corresponding
nullability.

## How was this change tested?

No new functionality, and all existing unit tests still pass.

To minimize the risk of unfaithful refactoring, the change was made in
four steps:
1. Use a multi-file regexp search/replace to convert simple code such
like this:
    ```rust
    StructField::new("a", DataType::LONG, true)
    ```
    to this: 
    ```rust
    StructField::nullable("a", DataType::LONG)
    ```
The exact expression used was: `StructField::new(\([^()]*\), true) →
StructField::nullable(\1)`, which ignores any constructor call
containing parentheses, to avoid ambiguity.
2. Use the multi-file regexp search/replace
`StructField::new(\([^()]*\), false) → StructField::not_null(\1)`, to
convert simple use `not_null` call sites (see above for details).
3. Use an interactive multi-file search/replace `StructField::new →
StructField::nullable`, relying on IDE parentheses matching to identify
calls that pass the literal `true` (first pass). As a safety measure,
the resulting code is compiled; all changed call sites fail to compile
because of the (now unrecognized) third arg, which can then be deleted
after verifying it is the literal `true`.
4. Use the same two-pass process for `StructField::new →
StructField::not_null` with literal `false`.

Each step is its own commit, for easier verification.
  • Loading branch information
scovich authored Jan 16, 2025
1 parent 616e9ac commit 76c65c8
Show file tree
Hide file tree
Showing 18 changed files with 258 additions and 307 deletions.
7 changes: 3 additions & 4 deletions ffi/src/test_ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,17 @@ pub unsafe extern "C" fn get_testing_kernel_expression() -> Handle<SharedExpress
let array_data = ArrayData::new(array_type.clone(), vec![Scalar::Short(5), Scalar::Short(0)]);

let nested_fields = vec![
StructField::new("a", DataType::INTEGER, false),
StructField::new("b", array_type, false),
StructField::not_null("a", DataType::INTEGER),
StructField::not_null("b", array_type),
];
let nested_values = vec![Scalar::Integer(500), Scalar::Array(array_data.clone())];
let nested_struct = StructData::try_new(nested_fields.clone(), nested_values).unwrap();
let nested_struct_type = StructType::new(nested_fields);

let top_level_struct = StructData::try_new(
vec![StructField::new(
vec![StructField::nullable(
"top",
DataType::Struct(Box::new(nested_struct_type)),
true,
)],
vec![Scalar::Struct(nested_struct)],
)
Expand Down
4 changes: 2 additions & 2 deletions integration-tests/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ fn create_arrow_schema() -> arrow::datatypes::Schema {

fn create_kernel_schema() -> delta_kernel::schema::Schema {
use delta_kernel::schema::{DataType, Schema, StructField};
let field_a = StructField::new("a", DataType::LONG, false);
let field_b = StructField::new("b", DataType::BOOLEAN, false);
let field_a = StructField::not_null("a", DataType::LONG);
let field_b = StructField::not_null("b", DataType::BOOLEAN);
Schema::new(vec![field_a, field_b])
}

Expand Down
129 changes: 54 additions & 75 deletions kernel/src/actions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -524,38 +524,30 @@ mod tests {
.project(&[METADATA_NAME])
.expect("Couldn't get metaData field");

let expected = Arc::new(StructType::new([StructField::new(
let expected = Arc::new(StructType::new([StructField::nullable(
"metaData",
StructType::new([
StructField::new("id", DataType::STRING, false),
StructField::new("name", DataType::STRING, true),
StructField::new("description", DataType::STRING, true),
StructField::new(
StructField::not_null("id", DataType::STRING),
StructField::nullable("name", DataType::STRING),
StructField::nullable("description", DataType::STRING),
StructField::not_null(
"format",
StructType::new([
StructField::new("provider", DataType::STRING, false),
StructField::new(
StructField::not_null("provider", DataType::STRING),
StructField::not_null(
"options",
MapType::new(DataType::STRING, DataType::STRING, false),
false,
),
]),
false,
),
StructField::new("schemaString", DataType::STRING, false),
StructField::new(
"partitionColumns",
ArrayType::new(DataType::STRING, false),
false,
),
StructField::new("createdTime", DataType::LONG, true),
StructField::new(
StructField::not_null("schemaString", DataType::STRING),
StructField::not_null("partitionColumns", ArrayType::new(DataType::STRING, false)),
StructField::nullable("createdTime", DataType::LONG),
StructField::not_null(
"configuration",
MapType::new(DataType::STRING, DataType::STRING, false),
false,
),
]),
true,
)]));
assert_eq!(schema, expected);
}
Expand All @@ -566,61 +558,55 @@ mod tests {
.project(&[ADD_NAME])
.expect("Couldn't get add field");

let expected = Arc::new(StructType::new([StructField::new(
let expected = Arc::new(StructType::new([StructField::nullable(
"add",
StructType::new([
StructField::new("path", DataType::STRING, false),
StructField::new(
StructField::not_null("path", DataType::STRING),
StructField::not_null(
"partitionValues",
MapType::new(DataType::STRING, DataType::STRING, true),
false,
),
StructField::new("size", DataType::LONG, false),
StructField::new("modificationTime", DataType::LONG, false),
StructField::new("dataChange", DataType::BOOLEAN, false),
StructField::new("stats", DataType::STRING, true),
StructField::new(
StructField::not_null("size", DataType::LONG),
StructField::not_null("modificationTime", DataType::LONG),
StructField::not_null("dataChange", DataType::BOOLEAN),
StructField::nullable("stats", DataType::STRING),
StructField::nullable(
"tags",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
),
deletion_vector_field(),
StructField::new("baseRowId", DataType::LONG, true),
StructField::new("defaultRowCommitVersion", DataType::LONG, true),
StructField::new("clusteringProvider", DataType::STRING, true),
StructField::nullable("baseRowId", DataType::LONG),
StructField::nullable("defaultRowCommitVersion", DataType::LONG),
StructField::nullable("clusteringProvider", DataType::STRING),
]),
true,
)]));
assert_eq!(schema, expected);
}

fn tags_field() -> StructField {
StructField::new(
StructField::nullable(
"tags",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
)
}

fn partition_values_field() -> StructField {
StructField::new(
StructField::nullable(
"partitionValues",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
)
}

fn deletion_vector_field() -> StructField {
StructField::new(
StructField::nullable(
"deletionVector",
DataType::struct_type([
StructField::new("storageType", DataType::STRING, false),
StructField::new("pathOrInlineDv", DataType::STRING, false),
StructField::new("offset", DataType::INTEGER, true),
StructField::new("sizeInBytes", DataType::INTEGER, false),
StructField::new("cardinality", DataType::LONG, false),
StructField::not_null("storageType", DataType::STRING),
StructField::not_null("pathOrInlineDv", DataType::STRING),
StructField::nullable("offset", DataType::INTEGER),
StructField::not_null("sizeInBytes", DataType::INTEGER),
StructField::not_null("cardinality", DataType::LONG),
]),
true,
)
}

Expand All @@ -629,21 +615,20 @@ mod tests {
let schema = get_log_schema()
.project(&[REMOVE_NAME])
.expect("Couldn't get remove field");
let expected = Arc::new(StructType::new([StructField::new(
let expected = Arc::new(StructType::new([StructField::nullable(
"remove",
StructType::new([
StructField::new("path", DataType::STRING, false),
StructField::new("deletionTimestamp", DataType::LONG, true),
StructField::new("dataChange", DataType::BOOLEAN, false),
StructField::new("extendedFileMetadata", DataType::BOOLEAN, true),
StructField::not_null("path", DataType::STRING),
StructField::nullable("deletionTimestamp", DataType::LONG),
StructField::not_null("dataChange", DataType::BOOLEAN),
StructField::nullable("extendedFileMetadata", DataType::BOOLEAN),
partition_values_field(),
StructField::new("size", DataType::LONG, true),
StructField::nullable("size", DataType::LONG),
tags_field(),
deletion_vector_field(),
StructField::new("baseRowId", DataType::LONG, true),
StructField::new("defaultRowCommitVersion", DataType::LONG, true),
StructField::nullable("baseRowId", DataType::LONG),
StructField::nullable("defaultRowCommitVersion", DataType::LONG),
]),
true,
)]));
assert_eq!(schema, expected);
}
Expand All @@ -653,20 +638,18 @@ mod tests {
let schema = get_log_schema()
.project(&[CDC_NAME])
.expect("Couldn't get remove field");
let expected = Arc::new(StructType::new([StructField::new(
let expected = Arc::new(StructType::new([StructField::nullable(
"cdc",
StructType::new([
StructField::new("path", DataType::STRING, false),
StructField::new(
StructField::not_null("path", DataType::STRING),
StructField::not_null(
"partitionValues",
MapType::new(DataType::STRING, DataType::STRING, true),
false,
),
StructField::new("size", DataType::LONG, false),
StructField::new("dataChange", DataType::BOOLEAN, false),
StructField::not_null("size", DataType::LONG),
StructField::not_null("dataChange", DataType::BOOLEAN),
tags_field(),
]),
true,
)]));
assert_eq!(schema, expected);
}
Expand All @@ -677,14 +660,13 @@ mod tests {
.project(&["txn"])
.expect("Couldn't get transaction field");

let expected = Arc::new(StructType::new([StructField::new(
let expected = Arc::new(StructType::new([StructField::nullable(
"txn",
StructType::new([
StructField::new("appId", DataType::STRING, false),
StructField::new("version", DataType::LONG, false),
StructField::new("lastUpdated", DataType::LONG, true),
StructField::not_null("appId", DataType::STRING),
StructField::not_null("version", DataType::LONG),
StructField::nullable("lastUpdated", DataType::LONG),
]),
true,
)]));
assert_eq!(schema, expected);
}
Expand All @@ -695,25 +677,22 @@ mod tests {
.project(&["commitInfo"])
.expect("Couldn't get commitInfo field");

let expected = Arc::new(StructType::new(vec![StructField::new(
let expected = Arc::new(StructType::new(vec![StructField::nullable(
"commitInfo",
StructType::new(vec![
StructField::new("timestamp", DataType::LONG, true),
StructField::new("inCommitTimestamp", DataType::LONG, true),
StructField::new("operation", DataType::STRING, true),
StructField::new(
StructField::nullable("timestamp", DataType::LONG),
StructField::nullable("inCommitTimestamp", DataType::LONG),
StructField::nullable("operation", DataType::STRING),
StructField::nullable(
"operationParameters",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
),
StructField::new("kernelVersion", DataType::STRING, true),
StructField::new(
StructField::nullable("kernelVersion", DataType::STRING),
StructField::nullable(
"engineCommitInfo",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
),
]),
true,
)]));
assert_eq!(schema, expected);
}
Expand Down
6 changes: 3 additions & 3 deletions kernel/src/actions/schemas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,20 +84,20 @@ pub(crate) trait GetNullableContainerStructField {
// nullable values
impl<T: ToNullableContainerType> GetNullableContainerStructField for T {
fn get_nullable_container_struct_field(name: impl Into<String>) -> StructField {
StructField::new(name, T::to_nullable_container_type(), false)
StructField::not_null(name, T::to_nullable_container_type())
}
}

// Normal types produce non-nullable fields
impl<T: ToDataType> GetStructField for T {
fn get_struct_field(name: impl Into<String>) -> StructField {
StructField::new(name, T::to_data_type(), false)
StructField::not_null(name, T::to_data_type())
}
}

// Option types produce nullable fields
impl<T: ToDataType> GetStructField for Option<T> {
fn get_struct_field(name: impl Into<String>) -> StructField {
StructField::new(name, T::to_data_type(), true)
StructField::nullable(name, T::to_data_type())
}
}
3 changes: 1 addition & 2 deletions kernel/src/engine/arrow_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,7 @@ mod tests {
fn test_metadata_string_conversion() -> DeltaResult<()> {
let mut metadata = HashMap::new();
metadata.insert("description", "hello world".to_owned());
let struct_field =
StructField::new("name", DataType::STRING, false).with_metadata(metadata);
let struct_field = StructField::not_null("name", DataType::STRING).with_metadata(metadata);

let arrow_field = ArrowField::try_from(&struct_field)?;
let new_metadata = arrow_field.metadata();
Expand Down
Loading

0 comments on commit 76c65c8

Please sign in to comment.