Skip to content

Commit

Permalink
feat: introduce feature flags to select major arrow versions
Browse files Browse the repository at this point in the history
This change introduces arrow_53 and arrow_54 feature flags on kernel
which are _required_ when using default-engine or sync-engine.
Fundamentally we must push users of the crate to select their arrow
major version through flags since Cargo _will_ include multiple major
versions in the dependency tree which can cause ABI breakages when
passing around symbols such as `RecordBatch`

See #640

Signed-off-by: R. Tyler Croy <[email protected]>
  • Loading branch information
rtyler committed Jan 20, 2025
1 parent 8494126 commit 212b392
Show file tree
Hide file tree
Showing 37 changed files with 210 additions and 213 deletions.
17 changes: 1 addition & 16 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,9 @@ license = "Apache-2.0"
repository = "https://github.com/delta-io/delta-kernel-rs"
readme = "README.md"
rust-version = "1.80"
version = "0.6.1"
version = "0.7.0"

[workspace.dependencies]
# When changing the arrow version range, also modify ffi/Cargo.toml which has
# its own arrow version ranges with modified features. Failure to do so will
# result in compilation errors as two different sets of arrow dependencies may
# be sourced
arrow = { version = ">=53, <55" }
arrow-arith = { version = ">=53, <55" }
arrow-array = { version = ">=53, <55" }
arrow-buffer = { version = ">=53, <55" }
arrow-cast = { version = ">=53, <55" }
arrow-data = { version = ">=53, <55" }
arrow-ord = { version = ">=53, <55" }
arrow-json = { version = ">=53, <55" }
arrow-select = { version = ">=53, <55" }
arrow-schema = { version = ">=53, <55" }
parquet = { version = ">=53, <55", features = ["object_store"] }
object_store = { version = ">=0.11, <0.12" }
hdfs-native-object-store = "0.12.0"
hdfs-native = "0.10.0"
Expand Down
7 changes: 1 addition & 6 deletions acceptance/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,14 @@ rust-version.workspace = true
release = false

[dependencies]
arrow-array = { workspace = true }
arrow-cast = { workspace = true }
arrow-ord = { workspace = true }
arrow-select = { workspace = true }
arrow-schema = { workspace = true }
delta_kernel = { path = "../kernel", features = [
"default-engine",
"arrow_53",
"developer-visibility",
] }
futures = "0.3"
itertools = "0.13"
object_store = { workspace = true }
parquet = { workspace = true }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
Expand Down
20 changes: 13 additions & 7 deletions acceptance/src/data.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
use std::{path::Path, sync::Arc};

use arrow_array::{Array, RecordBatch};
use arrow_ord::sort::{lexsort_to_indices, SortColumn};
use arrow_schema::{DataType, Schema};
use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
use delta_kernel::arrow::array::{Array, RecordBatch};
use delta_kernel::arrow::compute::{
concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn,
};
use delta_kernel::arrow::datatypes::{DataType, Schema};

use delta_kernel::parquet::arrow::async_reader::{
ParquetObjectReader, ParquetRecordBatchStreamBuilder,
};
use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
use futures::{stream::TryStreamExt, StreamExt};
use itertools::Itertools;
use object_store::{local::LocalFileSystem, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

use crate::{TestCaseInfo, TestResult};

Expand Down Expand Up @@ -83,8 +86,11 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
if **zone == *"+00:00" {
arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
.expect("Could not cast to UTC")
delta_kernel::arrow::compute::cast(
&col,
&DataType::Timestamp(*unit, Some("UTC".into())),
)
.expect("Could not cast to UTC")
} else {
col
}
Expand Down
2 changes: 1 addition & 1 deletion ffi/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ url = "2"
delta_kernel = { path = "../kernel", default-features = false, features = [
"developer-visibility",
] }
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" }
delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.7.0" }

# used if we use the default engine to be able to move arrow data into the c-ffi format
arrow-schema = { version = ">=53, <55", default-features = false, features = [
Expand Down
56 changes: 27 additions & 29 deletions kernel/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,29 @@ uuid = "1.10.0"
z85 = "3.0.5"

# bring in our derive macros
delta_kernel_derive = { path = "../derive-macros", version = "0.6.1" }
delta_kernel_derive = { path = "../derive-macros", version = "0.7.0" }

# used for developer-visibility
visibility = "0.1.1"

# Used in the sync engine
tempfile = { version = "3", optional = true }

# Arrow supported versions
## 53
# Used in default engine
arrow-buffer = { workspace = true, optional = true }
arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] }
arrow-select = { workspace = true, optional = true }
arrow-arith = { workspace = true, optional = true }
arrow-cast = { workspace = true, optional = true }
arrow-json = { workspace = true, optional = true }
arrow-ord = { workspace = true, optional = true }
arrow-schema = { workspace = true, optional = true }
arrow_53 = { package = "arrow", version = "53", features = ["chrono-tz", "json", "prettyprint"], optional = true }
# Used in default and sync engine
parquet_53 = { package = "parquet", version = "53", features = ["async", "object_store"] , optional = true }
######
## 54
arrow_54 = { package = "arrow", version = "54", features = ["chrono-tz", "json", "prettyprint"], optional = true }
parquet_54 = { package = "parquet", version = "54", features = ["async", "object_store"] , optional = true }
######

futures = { version = "0.3", optional = true }
object_store = { workspace = true, optional = true }
hdfs-native-object-store = { workspace = true, optional = true }
# Used in default and sync engine
parquet = { workspace = true, optional = true }
# Used for fetching direct urls (like pre-signed urls)
reqwest = { version = "0.12.8", default-features = false, optional = true }
strum = { version = "0.26", features = ["derive"] }
Expand All @@ -73,8 +75,20 @@ hdfs-native = { workspace = true, optional = true }
walkdir = { workspace = true, optional = true }

[features]
arrow-conversion = ["arrow-schema"]
arrow-expression = ["arrow-arith", "arrow-array", "arrow-buffer", "arrow-ord", "arrow-schema"]
# The default version to be expected
arrow = ["arrow_53"]
# The default version to be expected
parquet = ["parquet_53"]

arrow_53 = ["dep:arrow_53", "parquet_53"]
parquet_53 = ["dep:parquet_53"]

arrow_54 = ["dep:arrow_54", "parquet_54"]
parquet_54 = ["dep:parquet_54"]

arrow-conversion = []
arrow-expression = []

cloud = [
"object_store/aws",
"object_store/azure",
Expand All @@ -89,16 +103,8 @@ default = []
default-engine-base = [
"arrow-conversion",
"arrow-expression",
"arrow-array",
"arrow-buffer",
"arrow-cast",
"arrow-json",
"arrow-schema",
"arrow-select",
"futures",
"object_store",
"parquet/async",
"parquet/object_store",
"tokio",
"uuid/v4",
"uuid/fast-rng",
Expand All @@ -119,13 +125,6 @@ default-engine-rustls = [

developer-visibility = []
sync-engine = [
"arrow-cast",
"arrow-conversion",
"arrow-expression",
"arrow-array",
"arrow-json",
"arrow-select",
"parquet",
"tempfile",
]
integration-test = [
Expand All @@ -141,7 +140,6 @@ version = "=0.5.9"
rustc_version = "0.4.1"

[dev-dependencies]
arrow = { workspace = true, features = ["json", "prettyprint"] }
delta_kernel = { path = ".", features = ["default-engine", "sync-engine"] }
test_utils = { path = "../test-utils" }
paste = "1.0"
Expand Down
4 changes: 2 additions & 2 deletions kernel/examples/inspect-table/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ edition = "2021"
publish = false

[dependencies]
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
arrow = "53"
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
"developer-visibility",
] }
Expand Down
5 changes: 2 additions & 3 deletions kernel/examples/read-table-changes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@ publish = false
release = false

[dependencies]
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
arrow = { version = "53", features = ["prettyprint"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
] }
env_logger = "0.11.3"
url = "2"
itertools = "0.13"
arrow = { workspace = true, features = ["prettyprint"] }
3 changes: 2 additions & 1 deletion kernel/examples/read-table-multi-threaded/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ edition = "2021"
publish = false

[dependencies]
arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] }
arrow = { version = "53", features = ["prettyprint", "chrono-tz"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"cloud",
"arrow_53",
"default-engine",
"sync-engine",
"developer-visibility",
Expand Down
3 changes: 2 additions & 1 deletion kernel/examples/read-table-single-threaded/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ edition = "2021"
publish = false

[dependencies]
arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] }
arrow = { version = "53", features = ["prettyprint", "chrono-tz"] }
clap = { version = "4.5", features = ["derive"] }
delta_kernel = { path = "../../../kernel", features = [
"arrow_53",
"cloud",
"default-engine",
"sync-engine",
Expand Down
4 changes: 2 additions & 2 deletions kernel/src/actions/visitors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,8 @@ pub(crate) fn visit_deletion_vector_at<'a>(
mod tests {
use std::sync::Arc;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use crate::arrow::array::{RecordBatch, StringArray};
use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};

use super::*;
use crate::{
Expand Down
11 changes: 11 additions & 0 deletions kernel/src/arrow.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//! This module exists to help re-export the version of arrow used by default-engine and other
//! parts of kernel that need arrow.
//!
//! Exactly one of the `arrow_53` / `arrow_54` feature flags must be enabled: enabling both
//! would pull two incompatible major versions of arrow into the same build, so we fail
//! compilation early with a clear message instead of surfacing confusing type mismatches.
#[cfg(all(feature = "arrow_53", feature = "arrow_54"))]
compile_error!("Multiple versions of arrow cannot be used at the same time!");

// Re-export the selected arrow major version under the stable `crate::arrow` path so the
// rest of the kernel (and downstream users) never name a versioned crate directly.
#[cfg(feature = "arrow_53")]
pub use arrow_53::*;

#[cfg(feature = "arrow_54")]
pub use arrow_54::*;
5 changes: 3 additions & 2 deletions kernel/src/engine/arrow_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
use std::sync::Arc;

use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
use crate::arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
SchemaRef as ArrowSchemaRef, TimeUnit,
};
use crate::arrow::error::ArrowError;
use itertools::Itertools;

use crate::error::Error;
Expand Down
14 changes: 7 additions & 7 deletions kernel/src/engine/arrow_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ use crate::engine_data::{EngineData, EngineList, EngineMap, GetData, RowVisitor}
use crate::schema::{ColumnName, DataType};
use crate::{DeltaResult, Error};

use arrow_array::cast::AsArray;
use arrow_array::types::{Int32Type, Int64Type};
use arrow_array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray};
use arrow_schema::{FieldRef, DataType as ArrowDataType};
use tracing::{debug};
use crate::arrow::array::cast::AsArray;
use crate::arrow::array::types::{Int32Type, Int64Type};
use crate::arrow::array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray};
use crate::arrow::datatypes::{FieldRef, DataType as ArrowDataType};
use tracing::debug;

use std::collections::{HashMap, HashSet};

Expand Down Expand Up @@ -269,8 +269,8 @@ impl ArrowEngineData {
mod tests {
use std::sync::Arc;

use arrow_array::{RecordBatch, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use crate::arrow::array::{RecordBatch, StringArray};
use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};

use crate::{
actions::{get_log_schema, Metadata, Protocol},
Expand Down
31 changes: 16 additions & 15 deletions kernel/src/engine/arrow_expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,24 @@ use std::borrow::Borrow;
use std::collections::HashMap;
use std::sync::Arc;

use arrow_arith::boolean::{and_kleene, is_null, not, or_kleene};
use arrow_arith::numeric::{add, div, mul, sub};
use arrow_array::cast::AsArray;
use arrow_array::{types::*, MapArray};
use arrow_array::{
use crate::arrow::array::AsArray;
use crate::arrow::array::{types::*, MapArray};
use crate::arrow::array::{
Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum, Decimal128Array, Float32Array,
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, RecordBatch,
StringArray, StructArray, TimestampMicrosecondArray,
};
use arrow_buffer::OffsetBuffer;
use arrow_ord::cmp::{distinct, eq, gt, gt_eq, lt, lt_eq, neq};
use arrow_ord::comparison::in_list_utf8;
use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field as ArrowField, Fields, IntervalUnit,
Schema as ArrowSchema, TimeUnit,
use crate::arrow::buffer::OffsetBuffer;
use crate::arrow::compute::concat;
use crate::arrow::compute::kernels::cmp::{distinct, eq, gt, gt_eq, lt, lt_eq, neq};
use crate::arrow::compute::kernels::comparison::in_list_utf8;
use crate::arrow::compute::kernels::numeric::{add, div, mul, sub};
use crate::arrow::compute::{and_kleene, is_null, not, or_kleene};
use crate::arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Fields, IntervalUnit, Schema as ArrowSchema,
TimeUnit,
};
use arrow_select::concat::concat;
use crate::arrow::error::ArrowError;
use itertools::Itertools;

use super::arrow_conversion::LIST_ARRAY_ROOT;
Expand Down Expand Up @@ -568,9 +569,9 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator {
mod tests {
use std::ops::{Add, Div, Mul, Sub};

use arrow_array::{GenericStringArray, Int32Array};
use arrow_buffer::ScalarBuffer;
use arrow_schema::{DataType, Field, Fields, Schema};
use crate::arrow::array::{GenericStringArray, Int32Array};
use crate::arrow::buffer::ScalarBuffer;
use crate::arrow::datatypes::{DataType, Field, Fields, Schema};

use super::*;
use crate::expressions::*;
Expand Down
2 changes: 1 addition & 1 deletion kernel/src/engine/arrow_get_data.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use arrow_array::{
use crate::arrow::array::{
types::{GenericStringType, Int32Type, Int64Type},
Array, BooleanArray, GenericByteArray, GenericListArray, MapArray, OffsetSizeTrait,
PrimitiveArray,
Expand Down
Loading

0 comments on commit 212b392

Please sign in to comment.