From 212b392e60326fbd3c209e83b7aa2406f412b2b2 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Wed, 15 Jan 2025 03:22:37 +0000 Subject: [PATCH 1/2] feat: introduce feature flags to select major arrow versions This change introduces arrow_53 and arrow_54 feature flags on kernel which are _required_ when using default-engine or sync-engine. Fundamentally we must push users of the crate to select their arrow major version through flags since Cargo _will_ include multiple major versions in the dependency tree which can cause ABI breakages when passing around symbols such as `RecordBatch` See #640 Signed-off-by: R. Tyler Croy --- Cargo.toml | 17 +----- acceptance/Cargo.toml | 7 +-- acceptance/src/data.rs | 20 ++++--- ffi/Cargo.toml | 2 +- kernel/Cargo.toml | 56 +++++++++---------- kernel/examples/inspect-table/Cargo.toml | 4 +- kernel/examples/read-table-changes/Cargo.toml | 5 +- .../read-table-multi-threaded/Cargo.toml | 3 +- .../read-table-single-threaded/Cargo.toml | 3 +- kernel/src/actions/visitors.rs | 4 +- kernel/src/arrow.rs | 11 ++++ kernel/src/engine/arrow_conversion.rs | 5 +- kernel/src/engine/arrow_data.rs | 14 ++--- kernel/src/engine/arrow_expression.rs | 31 +++++----- kernel/src/engine/arrow_get_data.rs | 2 +- kernel/src/engine/arrow_utils.rs | 22 ++++---- kernel/src/engine/default/file_stream.rs | 4 +- kernel/src/engine/default/json.rs | 8 +-- kernel/src/engine/default/parquet.rs | 17 +++--- kernel/src/engine/ensure_data_types.rs | 8 +-- .../src/engine/parquet_row_group_skipping.rs | 8 +-- .../parquet_row_group_skipping/tests.rs | 2 +- kernel/src/engine/sync/json.rs | 11 ++-- kernel/src/engine/sync/mod.rs | 2 +- kernel/src/engine/sync/parquet.rs | 4 +- kernel/src/error.rs | 15 +++-- kernel/src/lib.rs | 2 + kernel/src/parquet.rs | 11 ++++ kernel/src/scan/mod.rs | 4 +- kernel/src/transaction.rs | 43 ++++++-------- kernel/tests/cdf.rs | 4 +- kernel/tests/common/mod.rs | 8 +-- kernel/tests/golden_tables.rs | 16 +++--- kernel/tests/read.rs | 5 +- kernel/tests/write.rs | 32 +++++------ test-utils/Cargo.toml | 5 +- test-utils/src/lib.rs | 8 +-- 37 files changed, 210 insertions(+), 213 deletions(-) create mode 100644 kernel/src/arrow.rs create mode 100644 kernel/src/parquet.rs diff --git a/Cargo.toml b/Cargo.toml index ec7993736..b85704310 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,24 +20,9 @@ license = "Apache-2.0" repository = "https://github.com/delta-io/delta-kernel-rs" readme = "README.md" rust-version = "1.80" -version = "0.6.1" +version = "0.7.0" [workspace.dependencies] -# When changing the arrow version range, also modify ffi/Cargo.toml which has -# its own arrow version ranges witeh modified features. Failure to do so will -# result in compilation errors as two different sets of arrow dependencies may -# be sourced -arrow = { version = ">=53, <55" } -arrow-arith = { version = ">=53, <55" } -arrow-array = { version = ">=53, <55" } -arrow-buffer = { version = ">=53, <55" } -arrow-cast = { version = ">=53, <55" } -arrow-data = { version = ">=53, <55" } -arrow-ord = { version = ">=53, <55" } -arrow-json = { version = ">=53, <55" } -arrow-select = { version = ">=53, <55" } -arrow-schema = { version = ">=53, <55" } -parquet = { version = ">=53, <55", features = ["object_store"] } object_store = { version = ">=0.11, <0.12" } hdfs-native-object-store = "0.12.0" hdfs-native = "0.10.0" diff --git a/acceptance/Cargo.toml b/acceptance/Cargo.toml index 2854c7c39..e844007ef 100644 --- a/acceptance/Cargo.toml +++ b/acceptance/Cargo.toml @@ -14,19 +14,14 @@ rust-version.workspace = true release = false [dependencies] -arrow-array = { workspace = true } -arrow-cast = { workspace = true } -arrow-ord = { workspace = true } -arrow-select = { workspace = true } -arrow-schema = { workspace = true } delta_kernel = { path = "../kernel", features = [ "default-engine", + "arrow_53", "developer-visibility", ] } futures = "0.3" itertools = "0.13" object_store = { workspace = true } -parquet = { workspace = true } serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "1" diff --git a/acceptance/src/data.rs b/acceptance/src/data.rs index c515d50c9..9b60b92e0 100644 --- a/acceptance/src/data.rs +++ b/acceptance/src/data.rs @@ -1,15 +1,18 @@ use std::{path::Path, sync::Arc}; -use arrow_array::{Array, RecordBatch}; -use arrow_ord::sort::{lexsort_to_indices, SortColumn}; -use arrow_schema::{DataType, Schema}; -use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take}; +use delta_kernel::arrow::array::{Array, RecordBatch}; +use delta_kernel::arrow::compute::{ + concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn, +}; +use delta_kernel::arrow::datatypes::{DataType, Schema}; +use delta_kernel::parquet::arrow::async_reader::{ + ParquetObjectReader, ParquetRecordBatchStreamBuilder, +}; use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table}; use futures::{stream::TryStreamExt, StreamExt}; use itertools::Itertools; use object_store::{local::LocalFileSystem, ObjectStore}; -use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; use crate::{TestCaseInfo, TestResult}; @@ -83,8 +86,11 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) { fn normalize_col(col: Arc) -> Arc { if let DataType::Timestamp(unit, Some(zone)) = col.data_type() { if **zone == *"+00:00" { - arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into()))) - .expect("Could not cast to UTC") + delta_kernel::arrow::compute::cast( + &col, + &DataType::Timestamp(*unit, Some("UTC".into())), + ) + .expect("Could not cast to UTC") } else { col } diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index aa4edc167..cb59461a4 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -24,7 +24,7 @@ url = "2" delta_kernel = { path = "../kernel", default-features = false, features = [ "developer-visibility", ] } -delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" } +delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.7.0" } # used if we use the default engine to be able to move arrow data into the c-ffi format arrow-schema = { version = ">=53, <55", default-features = false, features = [ diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index a045153cf..eebbc953a 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -39,27 +39,29 @@ uuid = "1.10.0" z85 = "3.0.5" # bring in our derive macros -delta_kernel_derive = { path = "../derive-macros", version = "0.6.1" } +delta_kernel_derive = { path = "../derive-macros", version = "0.7.0" } # used for developer-visibility visibility = "0.1.1" # Used in the sync engine tempfile = { version = "3", optional = true } + +# Arrow supported versions +## 53 # Used in default engine -arrow-buffer = { workspace = true, optional = true } -arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] } -arrow-select = { workspace = true, optional = true } -arrow-arith = { workspace = true, optional = true } -arrow-cast = { workspace = true, optional = true } -arrow-json = { workspace = true, optional = true } -arrow-ord = { workspace = true, optional = true } -arrow-schema = { workspace = true, optional = true } +arrow_53 = { package = "arrow", version = "53", features = ["chrono-tz", "json", "prettyprint"], optional = true } +# Used in default and sync engine +parquet_53 = { package = "parquet", version = "53", features = ["async", "object_store"] , optional = true } +###### +## 54 +arrow_54 = { package = "arrow", version = "54", features = ["chrono-tz", "json", "prettyprint"], optional = true } +parquet_54 = { package = "parquet", version = "54", features = ["async", "object_store"] , optional = true } +###### + futures = { version = "0.3", optional = true } object_store = { workspace = true, optional = true } hdfs-native-object-store = { workspace = true, optional = true } -# Used in default and sync engine -parquet = { workspace = true, optional = true } # Used for fetching direct urls (like pre-signed urls) reqwest = { version = "0.12.8", default-features = false, optional = true } strum = { version = "0.26", features = ["derive"] } @@ -73,8 +75,20 @@ hdfs-native = { workspace = true, optional = true } walkdir = { workspace = true, optional = true } [features] -arrow-conversion = ["arrow-schema"] -arrow-expression = ["arrow-arith", "arrow-array", "arrow-buffer", "arrow-ord", "arrow-schema"] +# The default version to be expected +arrow = ["arrow_53"] +# The default version to be expected +parquet = ["parquet_53"] + +arrow_53 = ["dep:arrow_53", "parquet_53"] +parquet_53 = ["dep:parquet_53"] + +arrow_54 = ["dep:arrow_54", "parquet_54"] +parquet_54 = ["dep:parquet_54"] + +arrow-conversion = [] +arrow-expression = [] + cloud = [ "object_store/aws", "object_store/azure", @@ -89,16 +103,8 @@ default = [] default-engine-base = [ "arrow-conversion", "arrow-expression", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-json", - "arrow-schema", - "arrow-select", "futures", "object_store", - "parquet/async", - "parquet/object_store", "tokio", "uuid/v4", "uuid/fast-rng", @@ -119,13 +125,6 @@ default-engine-rustls = [ developer-visibility = [] sync-engine = [ - "arrow-cast", - "arrow-conversion", - "arrow-expression", - "arrow-array", - "arrow-json", - "arrow-select", - "parquet", "tempfile", ] integration-test = [ @@ -141,7 +140,6 @@ version = "=0.5.9" rustc_version = "0.4.1" [dev-dependencies] -arrow = { workspace = true, features = ["json", "prettyprint"] } delta_kernel = { path = ".", features = ["default-engine", "sync-engine"] } test_utils = { path = "../test-utils" } paste = "1.0" diff --git a/kernel/examples/inspect-table/Cargo.toml b/kernel/examples/inspect-table/Cargo.toml index b81a8ac5b..4208c6938 100644 --- a/kernel/examples/inspect-table/Cargo.toml +++ b/kernel/examples/inspect-table/Cargo.toml @@ -5,11 +5,11 @@ edition = "2021" publish = false [dependencies] -arrow-array = { workspace = true } -arrow-schema = { workspace = true } +arrow = "53" clap = { version = "4.5", features = ["derive"] } delta_kernel = { path = "../../../kernel", features = [ "cloud", + "arrow_53", "default-engine", "developer-visibility", ] } diff --git a/kernel/examples/read-table-changes/Cargo.toml b/kernel/examples/read-table-changes/Cargo.toml index 181da7dc6..d212045be 100644 --- a/kernel/examples/read-table-changes/Cargo.toml +++ b/kernel/examples/read-table-changes/Cargo.toml @@ -8,14 +8,13 @@ publish = false release = false [dependencies] -arrow-array = { workspace = true } -arrow-schema = { workspace = true } +arrow = { version = "53", features = ["prettyprint"] } clap = { version = "4.5", features = ["derive"] } delta_kernel = { path = "../../../kernel", features = [ "cloud", + "arrow_53", "default-engine", ] } env_logger = "0.11.3" url = "2" itertools = "0.13" -arrow = { workspace = true, features = ["prettyprint"] } diff --git a/kernel/examples/read-table-multi-threaded/Cargo.toml b/kernel/examples/read-table-multi-threaded/Cargo.toml index 3362e579a..8cb7c9cd3 100644 --- a/kernel/examples/read-table-multi-threaded/Cargo.toml +++ b/kernel/examples/read-table-multi-threaded/Cargo.toml @@ -5,10 +5,11 @@ edition = "2021" publish = false [dependencies] -arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] } +arrow = { version = "53", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } delta_kernel = { path = "../../../kernel", features = [ "cloud", + "arrow_53", "default-engine", "sync-engine", "developer-visibility", diff --git a/kernel/examples/read-table-single-threaded/Cargo.toml b/kernel/examples/read-table-single-threaded/Cargo.toml index dc0458139..e71959e7b 100644 --- a/kernel/examples/read-table-single-threaded/Cargo.toml +++ b/kernel/examples/read-table-single-threaded/Cargo.toml @@ -5,9 +5,10 @@ edition = "2021" publish = false [dependencies] -arrow = { workspace = true, features = ["prettyprint", "chrono-tz"] } +arrow = { version = "53", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } delta_kernel = { path = "../../../kernel", features = [ + "arrow_53", "cloud", "default-engine", "sync-engine", diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 957befe80..34536ce1d 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -474,8 +474,8 @@ pub(crate) fn visit_deletion_vector_at<'a>( mod tests { use std::sync::Arc; - use arrow_array::{RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use super::*; use crate::{ diff --git a/kernel/src/arrow.rs b/kernel/src/arrow.rs new file mode 100644 index 000000000..0ea4b4d92 --- /dev/null +++ b/kernel/src/arrow.rs @@ -0,0 +1,11 @@ +//! This module exists to help re-export the version of arrow used by default-gengine and other +//! parts of kernel that need arrow + +#[cfg(all(feature = "arrow_53", feature = "arrow_54"))] +compile_error!("Multiple versions of the arrow cannot be used at the same time!"); + +#[cfg(feature = "arrow_53")] +pub use arrow_53::*; + +#[cfg(feature = "arrow_54")] +pub use arrow_54::*; diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index 0b905ff3a..a425cd143 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -2,10 +2,11 @@ use std::sync::Arc; -use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, +use crate::arrow::datatypes::{ + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; +use crate::arrow::error::ArrowError; use itertools::Itertools; use crate::error::Error; diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 7c2dd5f40..b96a4bd81 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -2,11 +2,11 @@ use crate::engine_data::{EngineData, EngineList, EngineMap, GetData, RowVisitor} use crate::schema::{ColumnName, DataType}; use crate::{DeltaResult, Error}; -use arrow_array::cast::AsArray; -use arrow_array::types::{Int32Type, Int64Type}; -use arrow_array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray}; -use arrow_schema::{FieldRef, DataType as ArrowDataType}; -use tracing::{debug}; +use crate::arrow::array::cast::AsArray; +use crate::arrow::array::types::{Int32Type, Int64Type}; +use crate::arrow::array::{Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray}; +use crate::arrow::datatypes::{FieldRef, DataType as ArrowDataType}; +use tracing::debug; use std::collections::{HashMap, HashSet}; @@ -269,8 +269,8 @@ impl ArrowEngineData { mod tests { use std::sync::Arc; - use arrow_array::{RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use crate::{ actions::{get_log_schema, Metadata, Protocol}, diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index 8ee54ebd0..b7a845171 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -3,23 +3,24 @@ use std::borrow::Borrow; use std::collections::HashMap; use std::sync::Arc; -use arrow_arith::boolean::{and_kleene, is_null, not, or_kleene}; -use arrow_arith::numeric::{add, div, mul, sub}; -use arrow_array::cast::AsArray; -use arrow_array::{types::*, MapArray}; -use arrow_array::{ +use crate::arrow::array::AsArray; +use crate::arrow::array::{types::*, MapArray}; +use crate::arrow::array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Datum, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, RecordBatch, StringArray, StructArray, TimestampMicrosecondArray, }; -use arrow_buffer::OffsetBuffer; -use arrow_ord::cmp::{distinct, eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow_ord::comparison::in_list_utf8; -use arrow_schema::{ - ArrowError, DataType as ArrowDataType, Field as ArrowField, Fields, IntervalUnit, - Schema as ArrowSchema, TimeUnit, +use crate::arrow::buffer::OffsetBuffer; +use crate::arrow::compute::concat; +use crate::arrow::compute::kernels::cmp::{distinct, eq, gt, gt_eq, lt, lt_eq, neq}; +use crate::arrow::compute::kernels::comparison::in_list_utf8; +use crate::arrow::compute::kernels::numeric::{add, div, mul, sub}; +use crate::arrow::compute::{and_kleene, is_null, not, or_kleene}; +use crate::arrow::datatypes::{ + DataType as ArrowDataType, Field as ArrowField, Fields, IntervalUnit, Schema as ArrowSchema, + TimeUnit, }; -use arrow_select::concat::concat; +use crate::arrow::error::ArrowError; use itertools::Itertools; use super::arrow_conversion::LIST_ARRAY_ROOT; @@ -568,9 +569,9 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator { mod tests { use std::ops::{Add, Div, Mul, Sub}; - use arrow_array::{GenericStringArray, Int32Array}; - use arrow_buffer::ScalarBuffer; - use arrow_schema::{DataType, Field, Fields, Schema}; + use crate::arrow::array::{GenericStringArray, Int32Array}; + use crate::arrow::buffer::ScalarBuffer; + use crate::arrow::datatypes::{DataType, Field, Fields, Schema}; use super::*; use crate::expressions::*; diff --git a/kernel/src/engine/arrow_get_data.rs b/kernel/src/engine/arrow_get_data.rs index 145aab66b..fbed64df1 100644 --- a/kernel/src/engine/arrow_get_data.rs +++ b/kernel/src/engine/arrow_get_data.rs @@ -1,4 +1,4 @@ -use arrow_array::{ +use crate::arrow::array::{ types::{GenericStringType, Int32Type, Int64Type}, Array, BooleanArray, GenericByteArray, GenericListArray, MapArray, OffsetSizeTrait, PrimitiveArray, diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 06441b9d4..dcb2e2465 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -12,18 +12,18 @@ use crate::{ DeltaResult, EngineData, Error, }; -use arrow_array::{ +use crate::arrow::array::{ cast::AsArray, new_null_array, Array as ArrowArray, GenericListArray, OffsetSizeTrait, RecordBatch, StringArray, StructArray, }; -use arrow_json::{LineDelimitedWriter, ReaderBuilder}; -use arrow_schema::{ +use crate::arrow::json::{LineDelimitedWriter, ReaderBuilder}; +use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, SchemaRef as ArrowSchemaRef, }; -use arrow_select::concat::concat_batches; +use crate::arrow::compute::concat_batches; +use crate::parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use itertools::Itertools; -use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use tracing::debug; macro_rules! prim_array_cmp { @@ -40,7 +40,7 @@ macro_rules! prim_array_cmp { .ok_or(Error::invalid_expression( format!("Cannot cast to list array: {}", $right_arr.data_type())) )?; - arrow_ord::comparison::in_list(prim_array, list_array).map(wrap_comparison_result) + crate::arrow::compute::kernels::comparison::in_list(prim_array, list_array).map(wrap_comparison_result) } )+ _ => Err(ArrowError::CastError( @@ -59,7 +59,7 @@ pub(crate) use prim_array_cmp; /// returns a tuples of (mask_indices: Vec, reorder_indices: /// Vec). `mask_indices` is used for generating the mask for reading from the pub(crate) fn make_arrow_error(s: impl Into) -> Error { - Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s.into())).with_backtrace() + Error::Arrow(crate::arrow::error::ArrowError::InvalidArgumentError(s.into())).with_backtrace() } /* @@ -500,7 +500,7 @@ pub(crate) fn reorder_struct_array( match &reorder_index.transform { ReorderIndexTransform::Cast(target) => { let col = input_cols[parquet_position].as_ref(); - let col = Arc::new(arrow_cast::cast::cast(col, target)?); + let col = Arc::new(crate::arrow::compute::cast(col, target)?); let new_field = Arc::new( input_fields[parquet_position] .as_ref() @@ -679,14 +679,14 @@ pub(crate) fn to_json_bytes( mod tests { use std::sync::Arc; - use arrow::{ + use crate::arrow::{ array::AsArray, buffer::{OffsetBuffer, ScalarBuffer}, }; - use arrow_array::{ + use crate::arrow::array::{ Array, ArrayRef as ArrowArrayRef, BooleanArray, GenericListArray, Int32Array, StructArray, }; - use arrow_schema::{ + use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; diff --git a/kernel/src/engine/default/file_stream.rs b/kernel/src/engine/default/file_stream.rs index 075716a75..bcdc370a0 100644 --- a/kernel/src/engine/default/file_stream.rs +++ b/kernel/src/engine/default/file_stream.rs @@ -5,8 +5,8 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Context, Poll}; -use arrow_array::RecordBatch; -use arrow_schema::SchemaRef as ArrowSchemaRef; +use crate::arrow::array::RecordBatch; +use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; use futures::future::BoxFuture; use futures::stream::{BoxStream, Stream, StreamExt}; use futures::FutureExt; diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index ab296e12a..1a9bc5f74 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -5,8 +5,8 @@ use std::ops::Range; use std::sync::Arc; use std::task::{ready, Poll}; -use arrow_json::ReaderBuilder; -use arrow_schema::SchemaRef as ArrowSchemaRef; +use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use crate::arrow::json::ReaderBuilder; use bytes::{Buf, Bytes}; use futures::{StreamExt, TryStreamExt}; use object_store::path::Path; @@ -201,8 +201,8 @@ impl FileOpener for JsonOpener { mod tests { use std::path::PathBuf; - use arrow::array::{AsArray, RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::{AsArray, RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use itertools::Itertools; use object_store::{local::LocalFileSystem, ObjectStore}; diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index a65d329a2..382a81262 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -4,16 +4,16 @@ use std::collections::HashMap; use std::ops::Range; use std::sync::Arc; -use arrow_array::builder::{MapBuilder, MapFieldNames, StringBuilder}; -use arrow_array::{BooleanArray, Int64Array, RecordBatch, StringArray}; +use crate::arrow::array::builder::{MapBuilder, MapFieldNames, StringBuilder}; +use crate::arrow::array::{BooleanArray, Int64Array, RecordBatch, StringArray}; +use crate::parquet::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, +}; +use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; use futures::StreamExt; use object_store::path::Path; use object_store::DynObjectStore; -use parquet::arrow::arrow_reader::{ - ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, -}; -use parquet::arrow::arrow_writer::ArrowWriter; -use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; use uuid::Uuid; use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; @@ -371,8 +371,7 @@ mod tests { use std::path::PathBuf; use std::time::{SystemTime, UNIX_EPOCH}; - use arrow_array::array::Array; - use arrow_array::RecordBatch; + use crate::arrow::array::{Array, RecordBatch}; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; use url::Url; diff --git a/kernel/src/engine/ensure_data_types.rs b/kernel/src/engine/ensure_data_types.rs index 88ff01626..ab7fe26f0 100644 --- a/kernel/src/engine/ensure_data_types.rs +++ b/kernel/src/engine/ensure_data_types.rs @@ -2,7 +2,7 @@ use std::{collections::{HashMap, HashSet}, ops::Deref}; -use arrow_schema::{DataType as ArrowDataType, Field as ArrowField}; +use crate::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField}; use itertools::Itertools; use crate::{ @@ -256,7 +256,7 @@ fn metadata_eq( #[cfg(test)] mod tests { - use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Fields}; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField, Fields}; use crate::{ engine::ensure_data_types::ensure_data_types, @@ -276,8 +276,8 @@ mod tests { assert!(can_upcast_to_decimal(&Decimal128(5, 1), 6u8, 2i8)); assert!(can_upcast_to_decimal( &Decimal128(10, 5), - arrow_schema::DECIMAL128_MAX_PRECISION, - arrow_schema::DECIMAL128_MAX_SCALE - 5 + crate::arrow::datatypes::DECIMAL128_MAX_PRECISION, + crate::arrow::datatypes::DECIMAL128_MAX_SCALE - 5 )); assert!(can_upcast_to_decimal(&Int8, 3u8, 0i8)); diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index 0adae6c4b..7a973b204 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -3,10 +3,10 @@ use crate::expressions::{ColumnName, Expression, Scalar, UnaryExpression, Binary use crate::predicates::parquet_stats_skipping::ParquetStatsProvider; use crate::schema::{DataType, PrimitiveType}; use chrono::{DateTime, Days}; -use parquet::arrow::arrow_reader::ArrowReaderBuilder; -use parquet::file::metadata::RowGroupMetaData; -use parquet::file::statistics::Statistics; -use parquet::schema::types::ColumnDescPtr; +use crate::parquet::arrow::arrow_reader::ArrowReaderBuilder; +use crate::parquet::file::metadata::RowGroupMetaData; +use crate::parquet::file::statistics::Statistics; +use crate::parquet::schema::types::ColumnDescPtr; use std::collections::{HashMap, HashSet}; use tracing::debug; diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index 39a9c2ab5..96e9d3b27 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -2,7 +2,7 @@ use super::*; use crate::predicates::DataSkippingPredicateEvaluator as _; use crate::expressions::{column_name, column_expr}; use crate::Expression; -use parquet::arrow::arrow_reader::ArrowReaderMetadata; +use crate::parquet::arrow::arrow_reader::ArrowReaderMetadata; use std::fs::File; /// Performs an exhaustive set of reads against a specially crafted parquet file. diff --git a/kernel/src/engine/sync/json.rs b/kernel/src/engine/sync/json.rs index 3d33b1025..ddf61bd3c 100644 --- a/kernel/src/engine/sync/json.rs +++ b/kernel/src/engine/sync/json.rs @@ -1,6 +1,7 @@ use std::{fs::File, io::BufReader, io::Write}; -use arrow_schema::SchemaRef as ArrowSchemaRef; +use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use crate::arrow::json::ReaderBuilder; use tempfile::NamedTempFile; use url::Url; @@ -22,7 +23,7 @@ fn try_create_from_json( arrow_schema: ArrowSchemaRef, _predicate: Option, ) -> DeltaResult>> { - let json = arrow_json::ReaderBuilder::new(arrow_schema) + let json = ReaderBuilder::new(arrow_schema) .build(BufReader::new(file))? .map(|data| Ok(ArrowEngineData::new(data?))); Ok(json) @@ -92,10 +93,8 @@ mod tests { use std::sync::Arc; - use arrow_array::{RecordBatch, StringArray}; - use arrow_schema::DataType as ArrowDataType; - use arrow_schema::Field; - use arrow_schema::Schema as ArrowSchema; + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use serde_json::json; use url::Url; diff --git a/kernel/src/engine/sync/mod.rs b/kernel/src/engine/sync/mod.rs index f637ec105..ae80c23bd 100644 --- a/kernel/src/engine/sync/mod.rs +++ b/kernel/src/engine/sync/mod.rs @@ -7,7 +7,7 @@ use crate::{ FileMeta, FileSystemClient, JsonHandler, ParquetHandler, SchemaRef, }; -use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use crate::arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use itertools::Itertools; use std::fs::File; use std::sync::Arc; diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 260ef321b..3ce01ef5b 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -1,7 +1,7 @@ use std::fs::File; -use arrow_schema::SchemaRef as ArrowSchemaRef; -use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReaderBuilder}; +use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use crate::parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReaderBuilder}; use super::read_files; use crate::engine::arrow_data::ArrowEngineData; diff --git a/kernel/src/error.rs b/kernel/src/error.rs index 815ef3e51..91e42821d 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -10,6 +10,9 @@ use crate::schema::{DataType, StructType}; use crate::table_properties::ParseIntervalError; use crate::Version; +#[cfg(any(feature = "default-engine-base", feature = "sync-engine"))] +use crate::arrow::error::ArrowError; + /// A [`std::result::Result`] that has the kernel [`Error`] as the error variant pub type DeltaResult = std::result::Result; @@ -29,7 +32,7 @@ pub enum Error { /// An error performing operations on arrow data #[cfg(any(feature = "default-engine-base", feature = "sync-engine"))] #[error(transparent)] - Arrow(arrow_schema::ArrowError), + Arrow(ArrowError), /// User tried to convert engine data to the wrong type #[error("Invalid engine data type. Could not convert to {0}")] @@ -58,10 +61,10 @@ pub enum Error { #[error("Internal error {0}. This is a kernel bug, please report.")] InternalError(String), - /// An error encountered while working with parquet data - #[cfg(feature = "parquet")] + /// An error enountered while working with parquet data + #[cfg(any(feature = "default-engine-base", feature = "sync-engine"))] #[error("Arrow error: {0}")] - Parquet(#[from] parquet::errors::ParquetError), + Parquet(#[from] crate::parquet::errors::ParquetError), /// An error interacting with the object_store crate // We don't use [#from] object_store::Error here as our From impl transforms @@ -304,8 +307,8 @@ from_with_backtrace!( ); #[cfg(any(feature = "default-engine-base", feature = "sync-engine"))] -impl From for Error { - fn from(value: arrow_schema::ArrowError) -> Self { +impl From for Error { + fn from(value: ArrowError) -> Self { Self::Arrow(value).with_backtrace() } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 49dceea75..dd55ee719 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -86,6 +86,8 @@ pub mod table_features; pub mod table_properties; pub mod transaction; +pub mod arrow; +pub mod parquet; pub(crate) mod predicates; pub(crate) mod utils; diff --git a/kernel/src/parquet.rs b/kernel/src/parquet.rs new file mode 100644 index 000000000..357caf903 --- /dev/null +++ b/kernel/src/parquet.rs @@ -0,0 +1,11 @@ +//! This module exists to help re-export the version of arrow used by default-gengine and other +//! parts of kernel that need arrow + +#[cfg(all(feature = "arrow_53", feature = "arrow_54"))] +compile_error!("Multiple versions of the arrow cannot be used at the same time!"); + +#[cfg(feature = "arrow_53")] +pub use parquet_53::*; + +#[cfg(feature = "arrow_54")] +pub use parquet_54::*; diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 49af0222c..2d145c404 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -678,8 +678,8 @@ fn transform_to_logical_internal( pub(crate) mod test_utils { use std::sync::Arc; - use arrow_array::{RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::array::{RecordBatch, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use crate::{ actions::get_log_schema, diff --git a/kernel/src/transaction.rs b/kernel/src/transaction.rs index d74c2456a..4905668a4 100644 --- a/kernel/src/transaction.rs +++ b/kernel/src/transaction.rs @@ -339,11 +339,11 @@ mod tests { use crate::schema::MapType; use crate::{ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler}; - use arrow::json::writer::LineDelimitedWriter; - use arrow::record_batch::RecordBatch; - use arrow_array::builder::StringBuilder; - use arrow_schema::Schema as ArrowSchema; - use arrow_schema::{DataType as ArrowDataType, Field}; + use crate::arrow::array::{MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder}; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; + use crate::arrow::error::ArrowError; + use crate::arrow::json::writer::LineDelimitedWriter; + use crate::arrow::record_batch::RecordBatch; struct ExprEngine(Arc); @@ -371,16 +371,15 @@ mod tests { } } - fn build_map(entries: Vec<(&str, &str)>) -> arrow_array::MapArray { + fn build_map(entries: Vec<(&str, &str)>) -> MapArray { let key_builder = StringBuilder::new(); let val_builder = StringBuilder::new(); - let names = arrow_array::builder::MapFieldNames { + let names = MapFieldNames { entry: "entries".to_string(), key: "key".to_string(), value: "value".to_string(), }; - let mut builder = - arrow_array::builder::MapBuilder::new(Some(names), key_builder, val_builder); + let mut builder = MapBuilder::new(Some(names), key_builder, val_builder); for (key, val) in entries { builder.keys().append_value(key); builder.values().append_value(val); @@ -494,7 +493,7 @@ mod tests { engine_commit_info_schema, vec![ Arc::new(map_array), - Arc::new(arrow_array::StringArray::from(vec!["some_string"])), + Arc::new(StringArray::from(vec!["some_string"])), ], )?; @@ -533,7 +532,7 @@ mod tests { )])); let commit_info_batch = RecordBatch::try_new( engine_commit_info_schema, - vec![Arc::new(arrow_array::StringArray::new_null(1))], + vec![Arc::new(StringArray::new_null(1))], )?; let _ = generate_commit_info( @@ -542,12 +541,9 @@ mod tests { &ArrowEngineData::new(commit_info_batch), ) .map_err(|e| match e { - Error::Arrow(arrow_schema::ArrowError::SchemaError(_)) => (), + Error::Arrow(ArrowError::SchemaError(_)) => (), Error::Backtraced { source, .. } - if matches!( - &*source, - Error::Arrow(arrow_schema::ArrowError::SchemaError(_)) - ) => {} + if matches!(&*source, Error::Arrow(ArrowError::SchemaError(_))) => {} _ => panic!("expected arrow schema error error, got {:?}", e), }); @@ -564,7 +560,7 @@ mod tests { )])); let commit_info_batch = RecordBatch::try_new( engine_commit_info_schema, - vec![Arc::new(arrow_array::StringArray::new_null(1))], + vec![Arc::new(StringArray::new_null(1))], )?; let _ = generate_commit_info( @@ -573,12 +569,9 @@ mod tests { &ArrowEngineData::new(commit_info_batch), ) .map_err(|e| match e { - Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(_)) => (), + Error::Arrow(ArrowError::InvalidArgumentError(_)) => (), Error::Backtraced { source, .. } - if matches!( - &*source, - Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(_)) - ) => {} + if matches!(&*source, Error::Arrow(ArrowError::InvalidArgumentError(_))) => {} _ => panic!("expected arrow invalid arg error, got {:?}", e), }); @@ -644,16 +637,16 @@ mod tests { ), true, )])); - use arrow_array::builder::StringBuilder; + let key_builder = StringBuilder::new(); let val_builder = StringBuilder::new(); - let names = arrow_array::builder::MapFieldNames { + let names = crate::arrow::array::MapFieldNames { entry: "entries".to_string(), key: "key".to_string(), value: "value".to_string(), }; let mut builder = - arrow_array::builder::MapBuilder::new(Some(names), key_builder, val_builder); + crate::arrow::array::MapBuilder::new(Some(names), key_builder, val_builder); builder.append(is_null).unwrap(); let array = builder.finish(); diff --git a/kernel/tests/cdf.rs b/kernel/tests/cdf.rs index 2560dc71d..069018951 100644 --- a/kernel/tests/cdf.rs +++ b/kernel/tests/cdf.rs @@ -1,7 +1,7 @@ use std::{error, sync::Arc}; -use arrow::compute::filter_record_batch; -use arrow_array::RecordBatch; +use delta_kernel::arrow::array::RecordBatch; +use delta_kernel::arrow::compute::filter_record_batch; use delta_kernel::engine::sync::SyncEngine; use itertools::Itertools; diff --git a/kernel/tests/common/mod.rs b/kernel/tests/common/mod.rs index a918695b7..4268f0626 100644 --- a/kernel/tests/common/mod.rs +++ b/kernel/tests/common/mod.rs @@ -1,6 +1,6 @@ -use arrow::compute::filter_record_batch; -use arrow::record_batch::RecordBatch; -use arrow::util::pretty::pretty_format_batches; +use delta_kernel::arrow::compute::filter_record_batch; +use delta_kernel::arrow::record_batch::RecordBatch; +use delta_kernel::arrow::util::pretty::pretty_format_batches; use itertools::Itertools; use crate::ArrowEngineData; @@ -24,7 +24,7 @@ macro_rules! sort_lines { #[macro_export] macro_rules! assert_batches_sorted_eq { ($expected_lines_sorted: expr, $CHUNKS: expr) => { - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS) + let formatted = delta_kernel::arrow::util::pretty::pretty_format_batches($CHUNKS) .unwrap() .to_string(); // fix for windows: \r\n --> diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index 120271ef2..2b1bc1a71 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -3,23 +3,23 @@ //! Data (golden tables) are stored in tests/golden_data/.tar.zst //! Each table directory has a table/ and expected/ subdirectory with the input/output respectively -use arrow::array::AsArray; -use arrow::{compute::filter_record_batch, record_batch::RecordBatch}; -use arrow_ord::sort::{lexsort_to_indices, SortColumn}; -use arrow_schema::{FieldRef, Schema}; -use arrow_select::{concat::concat_batches, take::take}; +use delta_kernel::arrow::array::{Array, AsArray, StructArray}; +use delta_kernel::arrow::compute::{concat_batches, take}; +use delta_kernel::arrow::compute::{lexsort_to_indices, SortColumn}; +use delta_kernel::arrow::datatypes::{DataType, FieldRef, Schema}; +use delta_kernel::arrow::{compute::filter_record_batch, record_batch::RecordBatch}; use itertools::Itertools; use paste::paste; use std::path::{Path, PathBuf}; use std::sync::Arc; +use delta_kernel::parquet::arrow::async_reader::{ + ParquetObjectReader, ParquetRecordBatchStreamBuilder, +}; use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Table}; use futures::{stream::TryStreamExt, StreamExt}; use object_store::{local::LocalFileSystem, ObjectStore}; -use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; -use arrow_array::{Array, StructArray}; -use arrow_schema::DataType; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 46a72d309..73b3eab96 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -3,10 +3,9 @@ use std::ops::Not; use std::path::PathBuf; use std::sync::Arc; -use arrow::compute::filter_record_batch; -use arrow_schema::SchemaRef as ArrowSchemaRef; -use arrow_select::concat::concat_batches; use delta_kernel::actions::deletion_vector::split_vector; +use delta_kernel::arrow::compute::{concat_batches, filter_record_batch}; +use delta_kernel::arrow::datatypes::SchemaRef as ArrowSchemaRef; use delta_kernel::engine::arrow_data::ArrowEngineData; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; diff --git a/kernel/tests/write.rs b/kernel/tests/write.rs index 2ee6dfdd5..710999f52 100644 --- a/kernel/tests/write.rs +++ b/kernel/tests/write.rs @@ -1,10 +1,12 @@ use std::collections::HashMap; use std::sync::Arc; -use arrow::array::{Int32Array, StringArray}; -use arrow::record_batch::RecordBatch; -use arrow_schema::Schema as ArrowSchema; -use arrow_schema::{DataType as ArrowDataType, Field}; +use delta_kernel::arrow::array::{ + Int32Array, MapBuilder, MapFieldNames, StringArray, StringBuilder, +}; +use delta_kernel::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; +use delta_kernel::arrow::error::ArrowError; +use delta_kernel::arrow::record_batch::RecordBatch; use itertools::Itertools; use object_store::local::LocalFileSystem; use object_store::memory::InMemory; @@ -120,15 +122,14 @@ fn new_commit_info() -> DeltaResult> { false, )])); - use arrow_array::builder::StringBuilder; let key_builder = StringBuilder::new(); let val_builder = StringBuilder::new(); - let names = arrow_array::builder::MapFieldNames { + let names = MapFieldNames { entry: "entries".to_string(), key: "key".to_string(), value: "value".to_string(), }; - let mut builder = arrow_array::builder::MapBuilder::new(Some(names), key_builder, val_builder); + let mut builder = MapBuilder::new(Some(names), key_builder, val_builder); builder.keys().append_value("engineInfo"); builder.values().append_value("default engine"); builder.append(true).unwrap(); @@ -349,7 +350,7 @@ async fn test_append() -> Result<(), Box> { let append_data = [[1, 2, 3], [4, 5, 6]].map(|data| -> DeltaResult<_> { let data = RecordBatch::try_new( Arc::new(schema.as_ref().try_into()?), - vec![Arc::new(arrow::array::Int32Array::from(data.to_vec()))], + vec![Arc::new(Int32Array::from(data.to_vec()))], )?; Ok(Box::new(ArrowEngineData::new(data))) }); @@ -441,9 +442,7 @@ async fn test_append() -> Result<(), Box> { test_read( &ArrowEngineData::new(RecordBatch::try_new( Arc::new(schema.as_ref().try_into()?), - vec![Arc::new(arrow::array::Int32Array::from(vec![ - 1, 2, 3, 4, 5, 6, - ]))], + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6]))], )?), &table, engine, @@ -487,7 +486,7 @@ async fn test_append_partitioned() -> Result<(), Box> { let append_data = [[1, 2, 3], [4, 5, 6]].map(|data| -> DeltaResult<_> { let data = RecordBatch::try_new( Arc::new(data_schema.as_ref().try_into()?), - vec![Arc::new(arrow::array::Int32Array::from(data.to_vec()))], + vec![Arc::new(Int32Array::from(data.to_vec()))], )?; Ok(Box::new(ArrowEngineData::new(data))) }); @@ -627,7 +626,7 @@ async fn test_append_invalid_schema() -> Result<(), Box> let append_data = [["a", "b"], ["c", "d"]].map(|data| -> DeltaResult<_> { let data = RecordBatch::try_new( Arc::new(data_schema.as_ref().try_into()?), - vec![Arc::new(arrow::array::StringArray::from(data.to_vec()))], + vec![Arc::new(StringArray::from(data.to_vec()))], )?; Ok(Box::new(ArrowEngineData::new(data))) }); @@ -653,12 +652,9 @@ async fn test_append_invalid_schema() -> Result<(), Box> let mut write_metadata = futures::future::join_all(tasks).await.into_iter().flatten(); assert!(write_metadata.all(|res| match res { - Err(KernelError::Arrow(arrow_schema::ArrowError::SchemaError(_))) => true, + Err(KernelError::Arrow(ArrowError::SchemaError(_))) => true, Err(KernelError::Backtraced { source, .. }) - if matches!( - &*source, - KernelError::Arrow(arrow_schema::ArrowError::SchemaError(_)) - ) => + if matches!(&*source, KernelError::Arrow(ArrowError::SchemaError(_))) => true, _ => false, })); diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index 0a90e96ed..b602b2e68 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -12,9 +12,6 @@ version.workspace = true release = false [dependencies] -arrow-array = { workspace = true, features = ["chrono-tz"] } -arrow-schema = { workspace = true } -delta_kernel = { path = "../kernel", features = [ "default-engine" ] } +delta_kernel = { path = "../kernel", features = [ "default-engine" ] } itertools = "0.13.0" object_store = { workspace = true } -parquet = { workspace = true } diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs index 2605bea56..d69e6a0f1 100644 --- a/test-utils/src/lib.rs +++ b/test-utils/src/lib.rs @@ -2,14 +2,14 @@ use std::sync::Arc; -use arrow_array::{ArrayRef, Int32Array, RecordBatch, StringArray}; -use arrow_schema::ArrowError; +use delta_kernel::arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; +use delta_kernel::arrow::error::ArrowError; use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::parquet::arrow::arrow_writer::ArrowWriter; +use delta_kernel::parquet::file::properties::WriterProperties; use delta_kernel::EngineData; use itertools::Itertools; use object_store::{path::Path, ObjectStore}; -use parquet::arrow::arrow_writer::ArrowWriter; -use parquet::file::properties::WriterProperties; /// A common useful initial metadata and protocol. Also includes a single commitInfo pub const METADATA: &str = r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}} From a9694cd1620f2e9d50f4e18346118056a02d9264 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Mon, 20 Jan 2025 20:14:12 +0000 Subject: [PATCH 2/2] chore: unify some default features for GitHub Actions Because the arrow_53 and arrow_54 flags are mutually exclusive, we can no longer rely on the --all-features flag :sob: Signed-off-by: R. Tyler Croy --- .github/workflows/build.yml | 14 +++++++------- .github/workflows/default-cargo-features | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/default-cargo-features diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a8a24dd07..c8d6d67f3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,10 +37,10 @@ jobs: cargo install cargo-msrv --locked - name: verify-msrv run: | - cargo msrv --path kernel/ verify --all-features - cargo msrv --path derive-macros/ verify --all-features - cargo msrv --path ffi/ verify --all-features - cargo msrv --path ffi-proc-macros/ verify --all-features + cargo msrv --path kernel/ verify --features $(< .github/workflows/default-cargo-features) + cargo msrv --path derive-macros/ verify --features $(< .github/workflows/default-cargo-features) + cargo msrv --path ffi/ verify --features $(< .github/workflows/default-cargo-features) + cargo msrv --path ffi-proc-macros/ verify --features $(< .github/workflows/default-cargo-features) msrv-run-tests: runs-on: ubuntu-latest steps: @@ -104,7 +104,7 @@ jobs: - name: check kernel builds with no-default-features run: cargo build -p delta_kernel --no-default-features - name: build and lint with clippy - run: cargo clippy --benches --tests --all-features -- -D warnings + run: cargo clippy --benches --tests --features $(< .github/workflows/default-cargo-features) -- -D warnings - name: lint without default features run: cargo clippy --no-default-features -- -D warnings - name: check kernel builds with default-engine @@ -129,7 +129,7 @@ jobs: override: true - uses: Swatinem/rust-cache@v2 - name: test - run: cargo test --workspace --verbose --all-features -- --skip read_table_version_hdfs + run: cargo test --workspace --verbose --features $(< .github/workflows/default-cargo-features) -- --skip read_table_version_hdfs ffi_test: runs-on: ${{ matrix.os }} @@ -229,7 +229,7 @@ jobs: uses: taiki-e/install-action@cargo-llvm-cov - uses: Swatinem/rust-cache@v2 - name: Generate code coverage - run: cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs + run: cargo llvm-cov --features $(< .github/workflows/default-cargo-features) --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: diff --git a/.github/workflows/default-cargo-features b/.github/workflows/default-cargo-features new file mode 100644 index 000000000..bee74feef --- /dev/null +++ b/.github/workflows/default-cargo-features @@ -0,0 +1 @@ +integration-test,default-engine,default-engine-rustls,cloud,arrow,sync-engine