Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce feature flags to select major arrow versions #654

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ jobs:
cargo install cargo-msrv --locked
- name: verify-msrv
run: |
cargo msrv --path kernel/ verify --all-features
cargo msrv --path derive-macros/ verify --all-features
cargo msrv --path ffi/ verify --all-features
cargo msrv --path ffi-proc-macros/ verify --all-features
cargo msrv --path kernel/ verify --features $(< .github/workflows/default-cargo-features)
cargo msrv --path derive-macros/ verify --features $(< .github/workflows/default-cargo-features)
cargo msrv --path ffi/ verify --features $(< .github/workflows/default-cargo-features)
cargo msrv --path ffi-proc-macros/ verify --features $(< .github/workflows/default-cargo-features)
msrv-run-tests:
runs-on: ubuntu-latest
steps:
Expand Down Expand Up @@ -104,7 +104,7 @@ jobs:
- name: check kernel builds with no-default-features
run: cargo build -p delta_kernel --no-default-features
- name: build and lint with clippy
run: cargo clippy --benches --tests --all-features -- -D warnings
run: cargo clippy --benches --tests --features $(< .github/workflows/default-cargo-features) -- -D warnings
- name: lint without default features
run: cargo clippy --no-default-features -- -D warnings
- name: check kernel builds with default-engine
Expand All @@ -129,7 +129,7 @@ jobs:
override: true
- uses: Swatinem/rust-cache@v2
- name: test
run: cargo test --workspace --verbose --all-features -- --skip read_table_version_hdfs
run: cargo test --workspace --verbose --features $(< .github/workflows/default-cargo-features) -- --skip read_table_version_hdfs

ffi_test:
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -229,7 +229,7 @@ jobs:
uses: taiki-e/install-action@cargo-llvm-cov
- uses: Swatinem/rust-cache@v2
- name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
run: cargo llvm-cov --features $(< .github/workflows/default-cargo-features) --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/default-cargo-features
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
integration-test,default-engine,default-engine-rustls,cloud,arrow,sync-engine
17 changes: 1 addition & 16 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,9 @@ license = "Apache-2.0"
repository = "https://github.com/delta-io/delta-kernel-rs"
readme = "README.md"
rust-version = "1.80"
version = "0.6.1"
version = "0.7.0"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I usually do the updates to version all at once during a release

Suggested change
version = "0.7.0"
version = "0.6.1"


[workspace.dependencies]
# When changing the arrow version range, also modify ffi/Cargo.toml which has
# its own arrow version ranges witeh modified features. Failure to do so will
# result in compilation errors as two different sets of arrow dependencies may
# be sourced
arrow = { version = ">=53, <55" }
arrow-arith = { version = ">=53, <55" }
arrow-array = { version = ">=53, <55" }
arrow-buffer = { version = ">=53, <55" }
arrow-cast = { version = ">=53, <55" }
arrow-data = { version = ">=53, <55" }
arrow-ord = { version = ">=53, <55" }
arrow-json = { version = ">=53, <55" }
arrow-select = { version = ">=53, <55" }
arrow-schema = { version = ">=53, <55" }
parquet = { version = ">=53, <55", features = ["object_store"] }
object_store = { version = ">=0.11, <0.12" }
hdfs-native-object-store = "0.12.0"
hdfs-native = "0.10.0"
Expand Down
7 changes: 1 addition & 6 deletions acceptance/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,14 @@ rust-version.workspace = true
release = false

[dependencies]
arrow-array = { workspace = true }
arrow-cast = { workspace = true }
arrow-ord = { workspace = true }
arrow-select = { workspace = true }
arrow-schema = { workspace = true }
delta_kernel = { path = "../kernel", features = [
"default-engine",
"arrow_53",
"developer-visibility",
] }
futures = "0.3"
itertools = "0.13"
object_store = { workspace = true }
parquet = { workspace = true }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
Expand Down
20 changes: 13 additions & 7 deletions acceptance/src/data.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
use std::{path::Path, sync::Arc};

use arrow_array::{Array, RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::{DataType, Schema};
use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
use delta_kernel::arrow::array::{Array, RecordBatch};
use delta_kernel::arrow::compute::{
concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn,
};
use delta_kernel::arrow::datatypes::{DataType, Schema};

use delta_kernel::parquet::arrow::async_reader::{
ParquetObjectReader, ParquetRecordBatchStreamBuilder,
};
use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
use futures::{stream::TryStreamExt, StreamExt};
use itertools::Itertools;
use object_store::{local::LocalFileSystem, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

use crate::{TestCaseInfo, TestResult};

Expand Down Expand Up @@ -83,8 +86,11 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
if **zone == *"+00:00" {
arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
.expect("Could not cast to UTC")
delta_kernel::arrow::compute::cast(
&col,
&DataType::Timestamp(*unit, Some("UTC".into())),
)
.expect("Could not cast to UTC")
Comment on lines +89 to +93
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
delta_kernel::arrow::compute::cast(
&col,
&DataType::Timestamp(*unit, Some("UTC".into())),
)
.expect("Could not cast to UTC")
let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
delta_kernel::arrow::compute::cast(&col, &data_type)
.expect("Could not cast to UTC")

} else {
col
}
Expand Down
2 changes: 1 addition & 1 deletion ffi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ url = "2"
delta_kernel = { path = "../kernel", default-features = false, features = [
"developer-visibility",
] }
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" }
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.7.0" }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.7.0" }
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" }


# used if we use the default engine to be able to move arrow data into the c-ffi format
arrow-schema = { version = ">=53, <55", default-features = false, features = [
Expand Down
56 changes: 27 additions & 29 deletions kernel/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,29 @@ uuid = "1.10.0"
z85 = "3.0.5"

# bring in our derive macros
delta_kernel_derive = { path = "../derive-macros", version = "0.6.1" }
delta_kernel_derive = { path = "../derive-macros", version = "0.7.0" }
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
delta_kernel_derive = { path = "../derive-macros", version = "0.7.0" }
delta_kernel_derive = { path = "../derive-macros", version = "0.6.1" }


# used for developer-visibility
visibility = "0.1.1"

# Used in the sync engine
tempfile = { version = "3", optional = true }

# Arrow supported versions
## 53
# Used in default engine
arrow-buffer = { workspace = true, optional = true }
arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] }
arrow-select = { workspace = true, optional = true }
arrow-arith = { workspace = true, optional = true }
arrow-cast = { workspace = true, optional = true }
arrow-json = { workspace = true, optional = true }
arrow-ord = { workspace = true, optional = true }
arrow-schema = { workspace = true, optional = true }
arrow_53 = { package = "arrow", version = "53", features = ["chrono-tz", "json", "prettyprint"], optional = true }
# Used in default and sync engine
parquet_53 = { package = "parquet", version = "53", features = ["async", "object_store"] , optional = true }
######
## 54
arrow_54 = { package = "arrow", version = "54", features = ["chrono-tz", "json", "prettyprint"], optional = true }
parquet_54 = { package = "parquet", version = "54", features = ["async", "object_store"] , optional = true }
######
Comment on lines +50 to +60
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nicklan @zachschuermann A lot of the ❌ are due to the use of --all-features in the GitHub Actions invocations. Unfortunately this will need to be worked around since there are now mutually exclusive features present 😞

Crazy idea... what if we defined the features as lower bounds instead of equality matches? So if e.g. arrow_min_54 feature is requested, that forces the version to be at least arrow-54; selecting arrow_min_53 would not cause any problems in that case, because both conditions are satisfied by choosing arrow-54. Tho maybe it would serve our purposes better to define arrow_max_53 and arrow_max_54 instead (which, if both are requested, chooses arrow-53)? Then --all-features would still compile and if somebody only specified one of those it would have the desired effect?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a nice idea, but I don't think cargo can support it. That is, you do have to specify something like:

arrow_53 = { package = "arrow", version = "53", features = ["chrono-tz", "json", "prettyprint"], optional = true }
arrow_54 = { package = "arrow", version = "54", features = ["chrono-tz", "json", "prettyprint"], optional = true }

Somewhere to specify the optional dependencies. So now if we want a feature like arrow_max_54, we need to have that depend on.... something. What exactly it would depend on would depend on what other flags you had enabled, but afaik cargo does not support logic in the toml so you can't unify the flags and pick the right dependency.

Copy link
Collaborator

@scovich scovich Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two parts to this trick:

  • cargo.toml brings in arrow@53 as arrow_53 etc. This effectively bifurcates the arrow crate into multiple apparent crates, and would remain unchanged.
  • the extern crate trickery that decides which of the available versions to map back to arrow crate name. This is the part that might use the lower/upper bound concept.

But in retrospect, you're probably right... we can't use the original names to make decisions for the extern crate decl, and if we want e.g. arrow_max_54 to potentially point to arrow-53, then we'd have to bring both arrow-54 and arrow-53 as dependencies, which at last partly defeats the purpose of splitting them out.

Maybe it still works, tho? Technically both arrow versions would be dependencies, but only the one mapped in as an extern crate would actually get used by any kernel code?

(ugh, cargo... this shouldn't be so difficult!)


futures = { version = "0.3", optional = true }
object_store = { workspace = true, optional = true }
hdfs-native-object-store = { workspace = true, optional = true }
# Used in default and sync engine
parquet = { workspace = true, optional = true }
# Used for fetching direct urls (like pre-signed urls)
reqwest = { version = "0.12.8", default-features = false, optional = true }
strum = { version = "0.26", features = ["derive"] }
Expand All @@ -73,8 +75,20 @@ hdfs-native = { workspace = true, optional = true }
walkdir = { workspace = true, optional = true }

[features]
arrow-conversion = ["arrow-schema"]
arrow-expression = ["arrow-arith", "arrow-array", "arrow-buffer", "arrow-ord", "arrow-schema"]
# The default version to be expected
arrow = ["arrow_53"]
# The default version to be expected
parquet = ["parquet_53"]

arrow_53 = ["dep:arrow_53", "parquet_53"]
parquet_53 = ["dep:parquet_53"]

arrow_54 = ["dep:arrow_54", "parquet_54"]
parquet_54 = ["dep:parquet_54"]

arrow-conversion = []
arrow-expression = []

cloud = [
"object_store/aws",
"object_store/azure",
Expand All @@ -89,16 +103,8 @@ default = []
default-engine-base = [
"arrow-conversion",
"arrow-expression",
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-json",
"arrow-schema",
"arrow-select",
"futures",
"object_store",
"parquet/async",
"parquet/object_store",
"tokio",
"uuid/v4",
"uuid/fast-rng",
Expand All @@ -119,13 +125,6 @@ default-engine-rustls = [

developer-visibility = []
sync-engine = [
"arrow-cast",
"arrow-conversion",
"arrow-expression",
"arrow-array",
"arrow-json",
"arrow-select",
"parquet",
"tempfile",
]
integration-test = [
Expand All @@ -141,7 +140,6 @@ version = "=0.5.9"
rustc_version = "0.4.1"

[dev-dependencies]
arrow = { workspace = true, features = ["json", "prettyprint"] }
delta_kernel = { path = ".", features = ["default-engine", "sync-engine"] }
test_utils = { path = "../test-utils" }
paste = "1.0"
Expand Down
4 changes: 2 additions & 2 deletions kernel/examples/inspect-table/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ edition = "2021"
publish = false

[dependencies]
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
arrow = "53"
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
"developer-visibility",
] }
Expand Down
5 changes: 2 additions & 3 deletions kernel/examples/read-table-changes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@ publish = false
release = false

[dependencies]
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
arrow = { version = "53", features = ["prettyprint"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
] }
env_logger = "0.11.3"
url = "2"
itertools = "0.13"
arrow = { workspace = true, features = ["prettyprint"] }
3 changes: 2 additions & 1 deletion kernel/examples/read-table-multi-threaded/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ edition = "2021"
publish = false

[dependencies]
arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] }
arrow = { version = "53", features = ["prettyprint", "chrono-tz"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
"sync-engine",
"developer-visibility",
Expand Down
3 changes: 2 additions & 1 deletion kernel/examples/read-table-single-threaded/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ edition = "2021"
publish = false

[dependencies]
arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] }
arrow = { version = "53", features = ["prettyprint", "chrono-tz"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"arrow_53",
"cloud",
"default-engine",
"sync-engine",
Expand Down
4 changes: 2 additions & 2 deletions kernel/src/actions/visitors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,8 @@ pub(crate) fn visit_deletion_vector_at<'a>(
mod tests {
use std::sync::Arc;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use crate::arrow::array::{RecordBatch, StringArray};
use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};

use super::*;
use crate::{
Expand Down
11 changes: 11 additions & 0 deletions kernel/src/arrow.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//! This module exists to help re-export the version of arrow used by default-gengine and other
//! parts of kernel that need arrow
Comment on lines +1 to +2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
//! This module exists to help re-export the version of arrow used by default-gengine and other
//! parts of kernel that need arrow
//! This module exists to re-export the version of arrow used by default-engine and other
//! parts of kernel that need arrow

#[cfg(all(feature = "arrow_53", feature = "arrow_54"))]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oof, this #cfg is gonna get ugly as the number of supported arrow versions goes up. we can probably make it a little better by not using all in that case, and having multiple checks, but it's still not gonna be great. I don't think there's much better than that though sadly.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the plan was to only support 2-3 versions max at a time? Tho even 3 already makes this pretty ugly... and agree we'd need to split it into multiple checks, e.g. all(v1, v2); all(v1, v3); all(v2, v3).

compile_error!("Multiple versions of the arrow cannot be used at the same time!");

#[cfg(feature = "arrow_53")]
pub use arrow_53::*;

#[cfg(feature = "arrow_54")]
pub use arrow_54::*;
5 changes: 3 additions & 2 deletions kernel/src/engine/arrow_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
use std::sync::Arc;

use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
use crate::arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
SchemaRef as ArrowSchemaRef, TimeUnit,
};
use crate::arrow::error::ArrowError;
use itertools::Itertools;

use crate::error::Error;
Expand Down
14 changes: 7 additions & 7 deletions kernel/src/engine/arrow_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ use crate::engine_data::{EngineData, EngineList, EngineMap, GetData, RowVisitor}
use crate::schema::{ColumnName, DataType};
use crate::{DeltaResult, Error};

use arrow_array::cast::AsArray;
use arrow_array::types::{Int32Type, Int64Type};
use arrow_array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray};
use arrow_schema::{FieldRef, DataType as ArrowDataType};
use tracing::{debug};
use crate::arrow::array::cast::AsArray;
use crate::arrow::array::types::{Int32Type, Int64Type};
use crate::arrow::array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray};
use crate::arrow::datatypes::{FieldRef, DataType as ArrowDataType};
use tracing::debug;

use std::collections::{HashMap, HashSet};

Expand Down Expand Up @@ -269,8 +269,8 @@ impl ArrowEngineData {
mod tests {
use std::sync::Arc;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use crate::arrow::array::{RecordBatch, StringArray};
use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};

use crate::{
actions::{get_log_schema, Metadata, Protocol},
Expand Down
Loading
Loading