From 1089df9335b223b534d193e4022f8d8f12f43abf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Feb 2024 06:40:27 -0500 Subject: [PATCH 1/2] Add `datafusion-functions-array` crate --- .github/workflows/rust.yml | 3 + Cargo.toml | 3 +- README.md | 1 + datafusion-cli/Cargo.lock | 68 +++-- datafusion/core/Cargo.toml | 6 +- datafusion/core/src/execution/context/mod.rs | 5 + datafusion/core/src/lib.rs | 6 + datafusion/core/src/prelude.rs | 2 + .../tests/dataframe/dataframe_functions.rs | 30 ++- datafusion/expr/src/built_in_function.rs | 13 - datafusion/expr/src/expr_fn.rs | 7 - datafusion/functions-array/Cargo.toml | 45 ++++ datafusion/functions-array/README.md | 27 ++ datafusion/functions-array/src/kernels.rs | 254 ++++++++++++++++++ datafusion/functions-array/src/lib.rs | 56 ++++ datafusion/functions-array/src/macros.rs | 79 ++++++ datafusion/functions-array/src/udf.rs | 85 ++++++ .../physical-expr/src/array_expressions.rs | 214 --------------- datafusion/physical-expr/src/functions.rs | 3 - datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 3 - datafusion/proto/src/generated/prost.rs | 4 +- .../proto/src/logical_plan/from_proto.rs | 14 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - 24 files changed, 644 insertions(+), 287 deletions(-) create mode 100644 datafusion/functions-array/Cargo.toml create mode 100644 datafusion/functions-array/README.md create mode 100644 datafusion/functions-array/src/kernels.rs create mode 100644 datafusion/functions-array/src/lib.rs create mode 100644 datafusion/functions-array/src/macros.rs create mode 100644 datafusion/functions-array/src/udf.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index c94137ebd1f9..d0ce996a503e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -82,6 +82,9 @@ jobs: - name: Check function packages (encoding_expressions) run: cargo check --no-default-features --features=encoding_expressions -p datafusion + - name: Check function packages (array_expressions) + run: cargo check --no-default-features --features=array_expressions -p datafusion + - name: Check Cargo.lock for datafusion-cli run: | # If this test fails, try running `cargo update` in the `datafusion-cli` directory diff --git a/Cargo.toml b/Cargo.toml index e99618dd9f9e..dd6cb5e9e00f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [workspace] exclude = ["datafusion-cli"] -members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/functions", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/physical-plan", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", "docs", "test-utils", "benchmarks", +members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/functions", "datafusion/functions-array", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/physical-plan", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", "docs", "test-utils", "benchmarks", ] resolver = "2" @@ -51,6 +51,7 @@ datafusion-common = { path = "datafusion/common", version = "35.0.0" } datafusion-execution = { path = "datafusion/execution", version = "35.0.0" } datafusion-expr = { path = "datafusion/expr", version = "35.0.0" } datafusion-functions = { path = "datafusion/functions", version = "35.0.0" } +datafusion-functions-array = { path = "datafusion/functions-array", version = "35.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "35.0.0" } datafusion-physical-expr = { path = "datafusion/physical-expr", version = "35.0.0" } datafusion-physical-plan = { path = "datafusion/physical-plan", version = "35.0.0" } diff --git a/README.md b/README.md index 4cbc5bb0ad50..44e06e1b6a92 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ This crate has several [features] which can be specified in your `Cargo.toml`. Default features: +- `array_expressions`: functions for working with arrays such as `array_to_string` - `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` - `crypto_expressions`: cryptographic functions such as `md5` and `sha256` - `encoding_expressions`: `encode` and `decode` functions diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 072898cda46d..5ff41ed61d11 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -84,9 +84,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2faccea4cc4ab4a667ce676a30e8ec13922a692c99bb8f5b11f1502c72e04220" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "apache-avro" @@ -1116,6 +1116,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-array", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-plan", @@ -1238,6 +1239,18 @@ dependencies = [ "log", ] +[[package]] +name = "datafusion-functions-array" +version = "35.0.0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "log", + "paste", +] + [[package]] name = "datafusion-optimizer" version = "35.0.0" @@ -1738,9 +1751,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" +checksum = "d0c62115964e08cb8039170eb33c1d0e2388a256930279edca206fff675f82c3" [[package]] name = "hex" @@ -1950,9 +1963,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" dependencies = [ "wasm-bindgen", ] @@ -2230,9 +2243,9 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" dependencies = [ "num-traits", ] @@ -2292,7 +2305,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.4", + "hermit-abi 0.3.5", "libc", ] @@ -3306,13 +3319,12 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ "cfg-if", "fastrand 2.0.1", - "redox_syscall", "rustix", "windows-sys 0.52.0", ] @@ -3624,9 +3636,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" @@ -3727,9 +3739,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" +checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3737,9 +3749,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" +checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" dependencies = [ "bumpalo", "log", @@ -3752,9 +3764,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.40" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde2032aeb86bdfaecc8b261eef3cba735cc426c1f3a3416d1e0791be95fc461" +checksum = "877b9c3f61ceea0e56331985743b13f3d25c406a7098d45180fb5f09bc19ed97" dependencies = [ "cfg-if", "js-sys", @@ -3764,9 +3776,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" +checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3774,9 +3786,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" +checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", @@ -3787,9 +3799,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.90" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" +checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" [[package]] name = "wasm-streams" @@ -3806,9 +3818,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.67" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 2d795d0f8369..64fff9cbc25c 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -38,11 +38,12 @@ path = "src/lib.rs" [features] # Used to enable the avro format +array_expressions = ["datafusion-functions-array"] avro = ["apache-avro", "num-traits", "datafusion-common/avro"] backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression", "tokio-util"] crypto_expressions = ["datafusion-physical-expr/crypto_expressions", "datafusion-optimizer/crypto_expressions"] -default = ["crypto_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", "compression", "parquet"] +default = ["array_expressions", "crypto_expressions", "encoding_expressions", "regex_expressions", "unicode_expressions", "compression", "parquet"] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) force_hash_collisions = [] @@ -68,7 +69,8 @@ dashmap = { workspace = true } datafusion-common = { path = "../common", version = "35.0.0", features = ["object_store"], default-features = false } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-functions = { path = "../functions", version = "35.0.0" } +datafusion-functions = { workspace = true } +datafusion-functions-array = { workspace = true, optional = true } datafusion-optimizer = { path = "../optimizer", version = "35.0.0", default-features = false } datafusion-physical-expr = { path = "../physical-expr", version = "35.0.0", default-features = false } datafusion-physical-plan = { workspace = true } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index b9039c5c9273..ca7667a42c5e 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1348,6 +1348,11 @@ impl SessionState { datafusion_functions::register_all(&mut new_self) .expect("can not register built in functions"); + // register crate of array expressions (if enabled) + #[cfg(feature = "array_expressions")] + datafusion_functions_array::register_all(&mut new_self) + .expect("can not register array expressions"); + new_self } /// Returns new [`SessionState`] using the provided diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 0f7292e1c3d3..d78d7a38a1c3 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -521,6 +521,12 @@ pub mod functions { pub use datafusion_functions::*; } +/// re-export of [`datafusion_functions_array`] crate, if "array_expressions" feature is enabled +pub mod functions_array { + #[cfg(feature = "array_expressions")] + pub use datafusion_functions::*; +} + #[cfg(test)] pub mod test; pub mod test_util; diff --git a/datafusion/core/src/prelude.rs b/datafusion/core/src/prelude.rs index 69c33355402b..d82a5a2cc1a1 100644 --- a/datafusion/core/src/prelude.rs +++ b/datafusion/core/src/prelude.rs @@ -39,6 +39,8 @@ pub use datafusion_expr::{ Expr, }; pub use datafusion_functions::expr_fn::*; +#[cfg(feature = "array_expressions")] +pub use datafusion_functions_array::expr_fn::*; pub use std::ops::Not; pub use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 486ea712edeb..48b9120698ad 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -20,6 +20,8 @@ use arrow::{ array::{Int32Array, StringArray}, record_batch::RecordBatch, }; +use arrow_array::types::Int32Type; +use arrow_array::ListArray; use arrow_schema::SchemaRef; use std::sync::Arc; @@ -40,6 +42,7 @@ fn test_schema() -> SchemaRef { Arc::new(Schema::new(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::Int32, false), + Field::new("l", DataType::new_list(DataType::Int32, true), true), ])) } @@ -57,6 +60,12 @@ async fn create_test_table() -> Result { "123AbcDef", ])), Arc::new(Int32Array::from(vec![1, 10, 10, 100])), + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ])), ], )?; @@ -67,7 +76,7 @@ async fn create_test_table() -> Result { ctx.table("test").await } -/// Excutes an expression on the test dataframe as a select. +/// Executes an expression on the test dataframe as a select. /// Compares formatted output of a record batch with an expected /// vector of strings, using the assert_batch_eq! macro macro_rules! assert_fn_batches { @@ -841,3 +850,22 @@ async fn test_fn_decode() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_fn_array_to_string() -> Result<()> { + let expr = array_to_string(col("l"), lit("***")); + + let expected = [ + "+-------------------------------------+", + "| array_to_string(test.l,Utf8(\"***\")) |", + "+-------------------------------------+", + "| 0***1***2 |", + "| |", + "| 3***5 |", + "| 6***7 |", + "+-------------------------------------+", + ]; + assert_fn_batches!(expr, expected); + + Ok(()) +} diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 4cdf0c4a11dd..79b08758e118 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -175,8 +175,6 @@ pub enum BuiltinScalarFunction { ArrayReverse, /// array_slice ArraySlice, - /// array_to_string - ArrayToString, /// array_intersect ArrayIntersect, /// array_union @@ -432,7 +430,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayReverse => Volatility::Immutable, BuiltinScalarFunction::Flatten => Volatility::Immutable, BuiltinScalarFunction::ArraySlice => Volatility::Immutable, - BuiltinScalarFunction::ArrayToString => Volatility::Immutable, BuiltinScalarFunction::ArrayIntersect => Volatility::Immutable, BuiltinScalarFunction::ArrayUnion => Volatility::Immutable, BuiltinScalarFunction::ArrayResize => Volatility::Immutable, @@ -628,7 +625,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayReverse => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArraySlice => Ok(input_expr_types[0].clone()), BuiltinScalarFunction::ArrayResize => Ok(input_expr_types[0].clone()), - BuiltinScalarFunction::ArrayToString => Ok(Utf8), BuiltinScalarFunction::ArrayIntersect => { match (input_expr_types[0].clone(), input_expr_types[1].clone()) { (DataType::Null, DataType::Null) | (DataType::Null, _) => { @@ -979,9 +975,6 @@ impl BuiltinScalarFunction { Signature::variadic_any(self.volatility()) } - BuiltinScalarFunction::ArrayToString => { - Signature::variadic_any(self.volatility()) - } BuiltinScalarFunction::ArrayIntersect => Signature::any(2, self.volatility()), BuiltinScalarFunction::ArrayUnion => Signature::any(2, self.volatility()), BuiltinScalarFunction::Cardinality => Signature::any(1, self.volatility()), @@ -1583,12 +1576,6 @@ impl BuiltinScalarFunction { } BuiltinScalarFunction::ArrayReverse => &["array_reverse", "list_reverse"], BuiltinScalarFunction::ArraySlice => &["array_slice", "list_slice"], - BuiltinScalarFunction::ArrayToString => &[ - "array_to_string", - "list_to_string", - "array_join", - "list_join", - ], BuiltinScalarFunction::ArrayUnion => &["array_union", "list_union"], BuiltinScalarFunction::Cardinality => &["cardinality"], BuiltinScalarFunction::ArrayResize => &["array_resize", "list_resize"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 877066aabfed..674d55e8eef5 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -740,12 +740,6 @@ scalar_expr!( array begin end stride, "returns a slice of the array." ); -scalar_expr!( - ArrayToString, - array_to_string, - array delimiter, - "converts each element to its text representation." -); scalar_expr!(ArrayUnion, array_union, array1 array2, "returns an array of the elements in the union of array1 and array2 without duplicates."); scalar_expr!( @@ -1447,7 +1441,6 @@ mod test { test_scalar_expr!(ArrayReplace, array_replace, array, from, to); test_scalar_expr!(ArrayReplaceN, array_replace_n, array, from, to, max); test_scalar_expr!(ArrayReplaceAll, array_replace_all, array, from, to); - test_scalar_expr!(ArrayToString, array_to_string, array, delimiter); test_unary_scalar_expr!(Cardinality, cardinality); test_nary_scalar_expr!(MakeArray, array, input); diff --git a/datafusion/functions-array/Cargo.toml b/datafusion/functions-array/Cargo.toml new file mode 100644 index 000000000000..9cf769bf294e --- /dev/null +++ b/datafusion/functions-array/Cargo.toml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-functions-array" +description = "Array Function packages for the DataFusion query engine" +keywords = ["datafusion", "logical", "plan", "expressions"] +readme = "README.md" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } + +[features] + +[lib] +name = "datafusion_functions_array" +path = "src/lib.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow = { workspace = true } +datafusion-common = { workspace = true } +datafusion-execution = { workspace = true } +datafusion-expr = { workspace = true } +log = "0.4.20" +paste = "1.0.14" diff --git a/datafusion/functions-array/README.md b/datafusion/functions-array/README.md new file mode 100644 index 000000000000..25deca8e1c77 --- /dev/null +++ b/datafusion/functions-array/README.md @@ -0,0 +1,27 @@ + + +# DataFusion Array Function Library + +[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. + +This crate contains functions for working with arrays, such as `array_append` that work with +`ListArray`, `LargeListArray` and `FixedListArray` types from the `arrow` crate. + +[df]: https://crates.io/crates/datafusion diff --git a/datafusion/functions-array/src/kernels.rs b/datafusion/functions-array/src/kernels.rs new file mode 100644 index 000000000000..1b96e01d8b9a --- /dev/null +++ b/datafusion/functions-array/src/kernels.rs @@ -0,0 +1,254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! implementation kernels for array functions + +use arrow::array::{ + Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray, + Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, OffsetSizeTrait, + StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, +}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_large_list_array, as_list_array, as_string_array}; +use datafusion_common::{exec_err, DataFusionError}; +use std::any::type_name; +use std::sync::Arc; + +macro_rules! downcast_arg { + ($ARG:expr, $ARRAY_TYPE:ident) => {{ + $ARG.as_any().downcast_ref::<$ARRAY_TYPE>().ok_or_else(|| { + DataFusionError::Internal(format!( + "could not cast to {}", + type_name::<$ARRAY_TYPE>() + )) + })? + }}; +} + +macro_rules! to_string { + ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{ + let arr = downcast_arg!($ARRAY, $ARRAY_TYPE); + for x in arr { + match x { + Some(x) => { + $ARG.push_str(&x.to_string()); + $ARG.push_str($DELIMITER); + } + None => { + if $WITH_NULL_STRING { + $ARG.push_str($NULL_STRING); + $ARG.push_str($DELIMITER); + } + } + } + } + Ok($ARG) + }}; +} + +macro_rules! call_array_function { + ($DATATYPE:expr, false) => { + match $DATATYPE { + DataType::Utf8 => array_function!(StringArray), + DataType::LargeUtf8 => array_function!(LargeStringArray), + DataType::Boolean => array_function!(BooleanArray), + DataType::Float32 => array_function!(Float32Array), + DataType::Float64 => array_function!(Float64Array), + DataType::Int8 => array_function!(Int8Array), + DataType::Int16 => array_function!(Int16Array), + DataType::Int32 => array_function!(Int32Array), + DataType::Int64 => array_function!(Int64Array), + DataType::UInt8 => array_function!(UInt8Array), + DataType::UInt16 => array_function!(UInt16Array), + DataType::UInt32 => array_function!(UInt32Array), + DataType::UInt64 => array_function!(UInt64Array), + _ => unreachable!(), + } + }; + ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{ + match $DATATYPE { + DataType::List(_) => array_function!(ListArray), + DataType::Utf8 => array_function!(StringArray), + DataType::LargeUtf8 => array_function!(LargeStringArray), + DataType::Boolean => array_function!(BooleanArray), + DataType::Float32 => array_function!(Float32Array), + DataType::Float64 => array_function!(Float64Array), + DataType::Int8 => array_function!(Int8Array), + DataType::Int16 => array_function!(Int16Array), + DataType::Int32 => array_function!(Int32Array), + DataType::Int64 => array_function!(Int64Array), + DataType::UInt8 => array_function!(UInt8Array), + DataType::UInt16 => array_function!(UInt16Array), + DataType::UInt32 => array_function!(UInt32Array), + DataType::UInt64 => array_function!(UInt64Array), + _ => unreachable!(), + } + }}; +} + +/// Array_to_string SQL function +pub(super) fn array_to_string(args: &[ArrayRef]) -> datafusion_common::Result { + if args.len() < 2 || args.len() > 3 { + return exec_err!("array_to_string expects two or three arguments"); + } + + let arr = &args[0]; + + let delimiters = as_string_array(&args[1])?; + let delimiters: Vec> = delimiters.iter().collect(); + + let mut null_string = String::from(""); + let mut with_null_string = false; + if args.len() == 3 { + null_string = as_string_array(&args[2])?.value(0).to_string(); + with_null_string = true; + } + + fn compute_array_to_string( + arg: &mut String, + arr: ArrayRef, + delimiter: String, + null_string: String, + with_null_string: bool, + ) -> datafusion_common::Result<&mut String> { + match arr.data_type() { + DataType::List(..) => { + let list_array = as_list_array(&arr)?; + for i in 0..list_array.len() { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } + + Ok(arg) + } + DataType::LargeList(..) => { + let list_array = as_large_list_array(&arr)?; + for i in 0..list_array.len() { + compute_array_to_string( + arg, + list_array.value(i), + delimiter.clone(), + null_string.clone(), + with_null_string, + )?; + } + + Ok(arg) + } + DataType::Null => Ok(arg), + data_type => { + macro_rules! array_function { + ($ARRAY_TYPE:ident) => { + to_string!( + arg, + arr, + &delimiter, + &null_string, + with_null_string, + $ARRAY_TYPE + ) + }; + } + call_array_function!(data_type, false) + } + } + } + + fn generate_string_array( + list_arr: &GenericListArray, + delimiters: Vec>, + null_string: String, + with_null_string: bool, + ) -> datafusion_common::Result { + let mut res: Vec> = Vec::new(); + for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) { + if let (Some(arr), Some(delimiter)) = (arr, delimiter) { + let mut arg = String::from(""); + let s = compute_array_to_string( + &mut arg, + arr, + delimiter.to_string(), + null_string.clone(), + with_null_string, + )? + .clone(); + + if let Some(s) = s.strip_suffix(delimiter) { + res.push(Some(s.to_string())); + } else { + res.push(Some(s)); + } + } else { + res.push(None); + } + } + + Ok(StringArray::from(res)) + } + + let arr_type = arr.data_type(); + let string_arr = match arr_type { + DataType::List(_) | DataType::FixedSizeList(_, _) => { + let list_array = as_list_array(&arr)?; + generate_string_array::( + list_array, + delimiters, + null_string, + with_null_string, + )? + } + DataType::LargeList(_) => { + let list_array = as_large_list_array(&arr)?; + generate_string_array::( + list_array, + delimiters, + null_string, + with_null_string, + )? + } + _ => { + let mut arg = String::from(""); + let mut res: Vec> = Vec::new(); + // delimiter length is 1 + assert_eq!(delimiters.len(), 1); + let delimiter = delimiters[0].unwrap(); + let s = compute_array_to_string( + &mut arg, + arr.clone(), + delimiter.to_string(), + null_string, + with_null_string, + )? + .clone(); + + if !s.is_empty() { + let s = s.strip_suffix(delimiter).unwrap().to_string(); + res.push(Some(s)); + } else { + res.push(Some(s)); + } + StringArray::from(res) + } + }; + + Ok(Arc::new(string_arr)) +} diff --git a/datafusion/functions-array/src/lib.rs b/datafusion/functions-array/src/lib.rs new file mode 100644 index 000000000000..84997ed10e32 --- /dev/null +++ b/datafusion/functions-array/src/lib.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Array Functions for [DataFusion]. +//! +//! This crate contains a collection of array functions implemented using the +//! extension API. +//! +//! [DataFusion]: https://crates.io/crates/datafusion +//! +//! You can register the functions in this crate using the [`register_all`] function. +//! + +#[macro_use] +pub mod macros; + +mod kernels; +mod udf; + +use datafusion_common::Result; +use datafusion_execution::FunctionRegistry; +use datafusion_expr::ScalarUDF; +use log::debug; +use std::sync::Arc; + +/// Fluent-style API for creating `Expr`s +pub mod expr_fn { + pub use super::udf::array_to_string; +} + +/// Registers all enabled packages with a [`FunctionRegistry`] +pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { + let functions: Vec> = vec![udf::array_to_string_udf()]; + functions.into_iter().try_for_each(|udf| { + let existing_udf = registry.register_udf(udf)?; + if let Some(existing_udf) = existing_udf { + debug!("Overwrite existing UDF: {}", existing_udf.name()); + } + Ok(()) as Result<()> + })?; + Ok(()) +} diff --git a/datafusion/functions-array/src/macros.rs b/datafusion/functions-array/src/macros.rs new file mode 100644 index 000000000000..c503fde05b18 --- /dev/null +++ b/datafusion/functions-array/src/macros.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Creates external API functions for an array UDF. Specifically, creates +/// +/// 1. Single `ScalarUDF` instance +/// +/// Creates a singleton `ScalarUDF` of the `$UDF` function named `$GNAME` and a +/// function named `$NAME` which returns that function named $NAME. +/// +/// This is used to ensure creating the list of `ScalarUDF` only happens once. +/// +/// # 2. `expr_fn` style function +/// +/// These are functions that create an `Expr` that invokes the UDF, used +/// primarily to programmatically create expressions. +/// +/// For example: +/// ```text +/// pub fn array_to_string(delimiter: Expr) -> Expr { +/// ... +/// } +/// ``` +/// # Arguments +/// * `UDF`: name of the [`ScalarUDFImpl`] +/// * `EXPR_FN`: name of the expr_fn function to be created +/// * `arg`: 0 or more named arguments for the function +/// * `DOC`: documentation string for the function +/// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF` +/// * `GNAME`: name for the single static instance of the `ScalarUDF` +/// +/// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl +macro_rules! make_udf_function { + ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $SCALAR_UDF_FN:ident) => { + paste::paste! { + // "fluent expr_fn" style function + #[doc = $DOC] + pub fn $EXPR_FN($($arg: Expr),*) -> Expr { + Expr::ScalarFunction(ScalarFunction::new_udf( + $SCALAR_UDF_FN(), + vec![$($arg),*], + )) + } + + /// Singleton instance of [`$UDF`], ensures the UDF is only created once + /// named STATIC_$(UDF). For example `STATIC_ArrayToString` + #[allow(non_upper_case_globals)] + static [< STATIC_ $UDF >]: std::sync::OnceLock> = + std::sync::OnceLock::new(); + + /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`] + /// + /// [`ScalarUDF`]: datafusion_expr::ScalarUDF + pub fn $SCALAR_UDF_FN() -> std::sync::Arc { + [< STATIC_ $UDF >] + .get_or_init(|| { + std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( + <$UDF>::new(), + )) + }) + .clone() + } + } + }; +} diff --git a/datafusion/functions-array/src/udf.rs b/datafusion/functions-array/src/udf.rs new file mode 100644 index 000000000000..b7f9d2497fb7 --- /dev/null +++ b/datafusion/functions-array/src/udf.rs @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ScalarUDFImpl`] definitions for array functions. + +use arrow::datatypes::DataType; +use datafusion_common::{plan_err, DataFusionError}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::Expr; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; + +// Create static instances of ScalarUDFs for each function +make_udf_function!(ArrayToString, + array_to_string, + array delimiter, // arg name + "converts each element to its text representation.", // doc + array_to_string_udf // internal function name +); + +#[derive(Debug)] +pub(super) struct ArrayToString { + signature: Signature, + aliases: Vec, +} + +impl ArrayToString { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![ + String::from("array_to_string"), + String::from("list_to_string"), + String::from("array_join"), + String::from("list_join"), + ], + } + } +} + +impl ScalarUDFImpl for ArrayToString { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_to_string" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> datafusion_common::Result { + use DataType::*; + Ok(match arg_types[0] { + List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8, + _ => { + return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList."); + } + }) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion_common::Result { + let args = ColumnarValue::values_to_arrays(args)?; + crate::kernels::array_to_string(&args).map(ColumnarValue::Array) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 0709e66a35c9..0468b3e54294 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -229,46 +229,6 @@ fn check_datatypes(name: &str, args: &[&ArrayRef]) -> Result<()> { Ok(()) } -macro_rules! call_array_function { - ($DATATYPE:expr, false) => { - match $DATATYPE { - DataType::Utf8 => array_function!(StringArray), - DataType::LargeUtf8 => array_function!(LargeStringArray), - DataType::Boolean => array_function!(BooleanArray), - DataType::Float32 => array_function!(Float32Array), - DataType::Float64 => array_function!(Float64Array), - DataType::Int8 => array_function!(Int8Array), - DataType::Int16 => array_function!(Int16Array), - DataType::Int32 => array_function!(Int32Array), - DataType::Int64 => array_function!(Int64Array), - DataType::UInt8 => array_function!(UInt8Array), - DataType::UInt16 => array_function!(UInt16Array), - DataType::UInt32 => array_function!(UInt32Array), - DataType::UInt64 => array_function!(UInt64Array), - _ => unreachable!(), - } - }; - ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{ - match $DATATYPE { - DataType::List(_) => array_function!(ListArray), - DataType::Utf8 => array_function!(StringArray), - DataType::LargeUtf8 => array_function!(LargeStringArray), - DataType::Boolean => array_function!(BooleanArray), - DataType::Float32 => array_function!(Float32Array), - DataType::Float64 => array_function!(Float64Array), - DataType::Int8 => array_function!(Int8Array), - DataType::Int16 => array_function!(Int16Array), - DataType::Int32 => array_function!(Int32Array), - DataType::Int64 => array_function!(Int64Array), - DataType::UInt8 => array_function!(UInt8Array), - DataType::UInt16 => array_function!(UInt16Array), - DataType::UInt32 => array_function!(UInt32Array), - DataType::UInt64 => array_function!(UInt64Array), - _ => unreachable!(), - } - }}; -} - /// Convert one or more [`ArrayRef`] of the same type into a /// `ListArray` or 'LargeListArray' depending on the offset size. /// @@ -1870,27 +1830,6 @@ pub fn array_replace_all(args: &[ArrayRef]) -> Result { } } -macro_rules! to_string { - ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{ - let arr = downcast_arg!($ARRAY, $ARRAY_TYPE); - for x in arr { - match x { - Some(x) => { - $ARG.push_str(&x.to_string()); - $ARG.push_str($DELIMITER); - } - None => { - if $WITH_NULL_STRING { - $ARG.push_str($NULL_STRING); - $ARG.push_str($DELIMITER); - } - } - } - } - Ok($ARG) - }}; -} - #[derive(Debug, PartialEq)] enum SetOp { Union, @@ -2058,159 +1997,6 @@ pub fn array_intersect(args: &[ArrayRef]) -> Result { general_set_op(array1, array2, SetOp::Intersect) } -/// Array_to_string SQL function -pub fn array_to_string(args: &[ArrayRef]) -> Result { - if args.len() < 2 || args.len() > 3 { - return exec_err!("array_to_string expects two or three arguments"); - } - - let arr = &args[0]; - - let delimiters = as_string_array(&args[1])?; - let delimiters: Vec> = delimiters.iter().collect(); - - let mut null_string = String::from(""); - let mut with_null_string = false; - if args.len() == 3 { - null_string = as_string_array(&args[2])?.value(0).to_string(); - with_null_string = true; - } - - fn compute_array_to_string( - arg: &mut String, - arr: ArrayRef, - delimiter: String, - null_string: String, - with_null_string: bool, - ) -> Result<&mut String> { - match arr.data_type() { - DataType::List(..) => { - let list_array = as_list_array(&arr)?; - for i in 0..list_array.len() { - compute_array_to_string( - arg, - list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - } - - Ok(arg) - } - DataType::LargeList(..) => { - let list_array = as_large_list_array(&arr)?; - for i in 0..list_array.len() { - compute_array_to_string( - arg, - list_array.value(i), - delimiter.clone(), - null_string.clone(), - with_null_string, - )?; - } - - Ok(arg) - } - DataType::Null => Ok(arg), - data_type => { - macro_rules! array_function { - ($ARRAY_TYPE:ident) => { - to_string!( - arg, - arr, - &delimiter, - &null_string, - with_null_string, - $ARRAY_TYPE - ) - }; - } - call_array_function!(data_type, false) - } - } - } - - fn generate_string_array( - list_arr: &GenericListArray, - delimiters: Vec>, - null_string: String, - with_null_string: bool, - ) -> Result { - let mut res: Vec> = Vec::new(); - for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) { - if let (Some(arr), Some(delimiter)) = (arr, delimiter) { - let mut arg = String::from(""); - let s = compute_array_to_string( - &mut arg, - arr, - delimiter.to_string(), - null_string.clone(), - with_null_string, - )? - .clone(); - - if let Some(s) = s.strip_suffix(delimiter) { - res.push(Some(s.to_string())); - } else { - res.push(Some(s)); - } - } else { - res.push(None); - } - } - - Ok(StringArray::from(res)) - } - - let arr_type = arr.data_type(); - let string_arr = match arr_type { - DataType::List(_) | DataType::FixedSizeList(_, _) => { - let list_array = as_list_array(&arr)?; - generate_string_array::( - list_array, - delimiters, - null_string, - with_null_string, - )? - } - DataType::LargeList(_) => { - let list_array = as_large_list_array(&arr)?; - generate_string_array::( - list_array, - delimiters, - null_string, - with_null_string, - )? - } - _ => { - let mut arg = String::from(""); - let mut res: Vec> = Vec::new(); - // delimiter length is 1 - assert_eq!(delimiters.len(), 1); - let delimiter = delimiters[0].unwrap(); - let s = compute_array_to_string( - &mut arg, - arr.clone(), - delimiter.to_string(), - null_string, - with_null_string, - )? - .clone(); - - if !s.is_empty() { - let s = s.strip_suffix(delimiter).unwrap().to_string(); - res.push(Some(s)); - } else { - res.push(Some(s)); - } - StringArray::from(res) - } - }; - - Ok(Arc::new(string_arr)) -} - /// Cardinality SQL function pub fn cardinality(args: &[ArrayRef]) -> Result { if args.len() != 1 { diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index cbd780a8fb32..bd5c1677b0ca 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -411,9 +411,6 @@ pub fn create_physical_fun( BuiltinScalarFunction::ArraySlice => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_slice)(args) }), - BuiltinScalarFunction::ArrayToString => Arc::new(|args| { - make_scalar_function_inner(array_expressions::array_to_string)(args) - }), BuiltinScalarFunction::ArrayIntersect => Arc::new(|args| { make_scalar_function_inner(array_expressions::array_intersect)(args) }), diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 667e53842e56..898136dbe4ac 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -641,7 +641,7 @@ enum ScalarFunction { ArrayPrepend = 94; ArrayRemove = 95; ArrayReplace = 96; - ArrayToString = 97; + // 97 was ArrayToString Cardinality = 98; ArrayElement = 99; ArraySlice = 100; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 5b7d27d0dff0..f49c7ff8ca75 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22400,7 +22400,6 @@ impl serde::Serialize for ScalarFunction { Self::ArrayPrepend => "ArrayPrepend", Self::ArrayRemove => "ArrayRemove", Self::ArrayReplace => "ArrayReplace", - Self::ArrayToString => "ArrayToString", Self::Cardinality => "Cardinality", Self::ArrayElement => "ArrayElement", Self::ArraySlice => "ArraySlice", @@ -22544,7 +22543,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayPrepend", "ArrayRemove", "ArrayReplace", - "ArrayToString", "Cardinality", "ArrayElement", "ArraySlice", @@ -22717,7 +22715,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "ArrayPrepend" => Ok(ScalarFunction::ArrayPrepend), "ArrayRemove" => Ok(ScalarFunction::ArrayRemove), "ArrayReplace" => Ok(ScalarFunction::ArrayReplace), - "ArrayToString" => Ok(ScalarFunction::ArrayToString), "Cardinality" => Ok(ScalarFunction::Cardinality), "ArrayElement" => Ok(ScalarFunction::ArrayElement), "ArraySlice" => Ok(ScalarFunction::ArraySlice), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index cdf4dadcf894..649b3a29d70e 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2728,7 +2728,7 @@ pub enum ScalarFunction { ArrayPrepend = 94, ArrayRemove = 95, ArrayReplace = 96, - ArrayToString = 97, + /// 97 was ArrayToString Cardinality = 98, ArrayElement = 99, ArraySlice = 100, @@ -2869,7 +2869,6 @@ impl ScalarFunction { ScalarFunction::ArrayPrepend => "ArrayPrepend", ScalarFunction::ArrayRemove => "ArrayRemove", ScalarFunction::ArrayReplace => "ArrayReplace", - ScalarFunction::ArrayToString => "ArrayToString", ScalarFunction::Cardinality => "Cardinality", ScalarFunction::ArrayElement => "ArrayElement", ScalarFunction::ArraySlice => "ArraySlice", @@ -3007,7 +3006,6 @@ impl ScalarFunction { "ArrayPrepend" => Some(Self::ArrayPrepend), "ArrayRemove" => Some(Self::ArrayRemove), "ArrayReplace" => Some(Self::ArrayReplace), - "ArrayToString" => Some(Self::ArrayToString), "Cardinality" => Some(Self::Cardinality), "ArrayElement" => Some(Self::ArrayElement), "ArraySlice" => Some(Self::ArraySlice), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 0689da803538..7a487bd86759 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -52,11 +52,10 @@ use datafusion_expr::{ array_intersect, array_length, array_ndims, array_pop_back, array_pop_front, array_position, array_positions, array_prepend, array_remove, array_remove_all, array_remove_n, array_repeat, array_replace, array_replace_all, array_replace_n, - array_resize, array_slice, array_sort, array_to_string, array_union, arrow_typeof, - ascii, asin, asinh, atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil, - character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, - current_date, current_time, date_bin, date_part, date_trunc, degrees, digest, - ends_with, exp, + array_resize, array_slice, array_sort, array_union, arrow_typeof, ascii, asin, asinh, + atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil, character_length, + chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, current_date, + current_time, date_bin, date_part, date_trunc, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, from_unixtime, gcd, gen_range, initcap, instr, isnan, iszero, lcm, left, levenshtein, ln, log, log10, log2, @@ -507,7 +506,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::ArrayReplaceAll => Self::ArrayReplaceAll, ScalarFunction::ArrayReverse => Self::ArrayReverse, ScalarFunction::ArraySlice => Self::ArraySlice, - ScalarFunction::ArrayToString => Self::ArrayToString, ScalarFunction::ArrayIntersect => Self::ArrayIntersect, ScalarFunction::ArrayUnion => Self::ArrayUnion, ScalarFunction::ArrayResize => Self::ArrayResize, @@ -1462,10 +1460,6 @@ pub fn parse_expr( parse_expr(&args[2], registry)?, parse_expr(&args[3], registry)?, )), - ScalarFunction::ArrayToString => Ok(array_to_string( - parse_expr(&args[0], registry)?, - parse_expr(&args[1], registry)?, - )), ScalarFunction::Range => Ok(gen_range( args.to_owned() .iter() diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 4df7f9fb6bf3..10afd9e59e0a 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1486,7 +1486,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::ArrayReplaceAll => Self::ArrayReplaceAll, BuiltinScalarFunction::ArrayReverse => Self::ArrayReverse, BuiltinScalarFunction::ArraySlice => Self::ArraySlice, - BuiltinScalarFunction::ArrayToString => Self::ArrayToString, BuiltinScalarFunction::ArrayIntersect => Self::ArrayIntersect, BuiltinScalarFunction::ArrayUnion => Self::ArrayUnion, BuiltinScalarFunction::Range => Self::Range, From 97a789abb2619fb5d2afb0f2203448f437183535 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Feb 2024 09:05:36 -0500 Subject: [PATCH 2/2] Add test for round tripping array_to_string --- .../proto/tests/cases/roundtrip_logical_plan.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 652e59672bc7..b6d288da2c3e 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -567,13 +567,15 @@ async fn roundtrip_expr_api() -> Result<()> { let table = ctx.table("t1").await?; let schema = table.schema().clone(); + // list of expressions to round trip + let expr_list = vec![ + encode(col("a").cast_to(&DataType::Utf8, &schema)?, lit("hex")), + decode(lit("1234"), lit("hex")), + array_to_string(array(vec![lit(1), lit(2), lit(3)]), lit(",")), + ]; + // ensure expressions created with the expr api can be round tripped - let plan = table - .select(vec![ - encode(col("a").cast_to(&DataType::Utf8, &schema)?, lit("hex")), - decode(lit("1234"), lit("hex")), - ])? - .into_optimized_plan()?; + let plan = table.select(expr_list)?.into_optimized_plan()?; let bytes = logical_plan_to_bytes(&plan)?; let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}"));