From 0eaf2891595ba626226d0f2c8ae2192571a98c01 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 09:31:34 +0800 Subject: [PATCH 01/38] backup Signed-off-by: jayzhan211 --- Cargo.toml | 2 + datafusion/aggregate-functions/Cargo.toml | 61 +++++++++ .../aggregate-functions/src/first_last.rs | 117 ++++++++++++++++++ datafusion/aggregate-functions/src/lib.rs | 109 ++++++++++++++++ datafusion/aggregate-functions/src/macros.rs | 50 ++++++++ datafusion/aggregate-functions/src/utils.rs | 16 +++ datafusion/common/src/utils.rs | 5 + datafusion/core/src/execution/context/mod.rs | 2 +- datafusion/expr/src/expr_fn.rs | 20 +-- 9 files changed, 371 insertions(+), 11 deletions(-) create mode 100644 datafusion/aggregate-functions/Cargo.toml create mode 100644 datafusion/aggregate-functions/src/first_last.rs create mode 100644 datafusion/aggregate-functions/src/lib.rs create mode 100644 datafusion/aggregate-functions/src/macros.rs create mode 100644 datafusion/aggregate-functions/src/utils.rs diff --git a/Cargo.toml b/Cargo.toml index 9df489724d46..acedb3d6c92a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "datafusion/core", "datafusion/expr", "datafusion/execution", + "datafusion/aggregate-functions", "datafusion/functions", "datafusion/functions-array", "datafusion/optimizer", @@ -76,6 +77,7 @@ datafusion-common = { path = "datafusion/common", version = "37.0.0", default-fe datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.0.0" } datafusion-execution = { path = "datafusion/execution", version = "37.0.0" } datafusion-expr = { path = "datafusion/expr", version = "37.0.0" } +datafusion-aggregate-functions = { path = "datafusion/aggregate-functions", version = "37.0.0" } datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } diff --git a/datafusion/aggregate-functions/Cargo.toml b/datafusion/aggregate-functions/Cargo.toml new file mode 100644 index 000000000000..a733b4e8142c --- /dev/null +++ b/datafusion/aggregate-functions/Cargo.toml @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-aggregate-functions" +description = "Aggregate function packages for the DataFusion query engine" +keywords = ["datafusion", "logical", "plan", "expressions"] +readme = "README.md" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "datafusion_aggregate_functions" +path = "src/lib.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow = { workspace = true } +base64 = { version = "0.22", optional = true } +blake2 = { version = "^0.10.2", optional = true } +blake3 = { version = "1.0", optional = true } +chrono = { workspace = true } +datafusion-common = { workspace = true } +datafusion-execution = { workspace = true } +datafusion-expr = { workspace = true } +datafusion-physical-expr = { workspace = true, default-features = true } +hashbrown = { version = "0.14", features = ["raw"], optional = true } +hex = { version = "0.4", optional = true } +itertools = { workspace = true } +log = { workspace = true } +md-5 = { version = "^0.10.0", optional = true } +regex = { version = "1.8", optional = true } +sha2 = { version = "^0.10.1", optional = true } +unicode-segmentation = { version = "^1.7.1", optional = true } +uuid = { version = "1.7", features = ["v4"], optional = true } + +[dev-dependencies] +criterion = "0.5" +rand = { workspace = true } +rstest = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt", "sync"] } \ No newline at end of file diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs new file mode 100644 index 000000000000..e09089c5abe7 --- /dev/null +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -0,0 +1,117 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::utils::format_state_name;
+use datafusion_common::Result;
+use datafusion_expr::function::AccumulatorArgs;
+use datafusion_expr::type_coercion::aggregates::NUMERICS;
+use datafusion_expr::{Accumulator, AccumulatorFactoryFunction, AggregateUDF, AggregateUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::fmt::Debug;
+
+make_udf_function!(
+    FirstValue,
+    first_value,
+    value: Expr,
+    "Returns the first value in a group of values.",
+    first_value_fn
+);
+
+
+pub struct FirstValue {
+    signature: Signature,
+    aliases: Vec<String>,
+    accumulator: AccumulatorFactoryFunction,
+}
+
+impl Debug for FirstValue {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.debug_struct("FirstValue")
+            .field("name", &self.name())
+            .field("signature", &self.signature)
+            .field("accumulator", &"<FUNC>")
+            .finish()
+    }
+}
+
+impl FirstValue {
+    pub fn new(
+        accumulator: AccumulatorFactoryFunction,
+    ) -> Self {
+        Self {
+            aliases: vec![
+                String::from("FIRST_VALUE"),
+            ],
+            signature: Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable),
+            accumulator,
+        }
+    }
+}
+
+impl AggregateUDFImpl for FirstValue {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "FIRST_VALUE"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(arg_types[0].clone())
+    }
+
+    fn accumulator(
+        &self,
+        acc_args: AccumulatorArgs,
+    ) -> Result<Box<dyn Accumulator>> {
+        (self.accumulator)(acc_args)
+    }
+
+    fn state_fields(
+        &self,
+        name: &str,
+        value_type: DataType,
+        ordering_fields: Vec<Field>,
+    ) -> Result<Vec<Field>> {
+        let mut fields = vec![Field::new(
+            format_state_name(name, "first_value"),
+            value_type,
+            true,
+        )];
+        fields.extend(ordering_fields);
+        fields.push(Field::new("is_set", DataType::Boolean, true));
+        Ok(fields)
+    }
+}
+
+/// Creates a new UDAF with a specific signature, state type and return type.
+/// The signature and state type must match the `Accumulator`'s implementation.
+/// TODO: We plan to move aggregate functions to their own crate. This function will be deprecated then.
+pub fn create_first_value(
+    _name: &str,
+    _signature: Signature,
+    accumulator: AccumulatorFactoryFunction,
+) -> AggregateUDF {
+    AggregateUDF::from(FirstValue::new(accumulator))
+}
\ No newline at end of file
diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs
new file mode 100644
index 000000000000..6435022eb60b
--- /dev/null
+++ b/datafusion/aggregate-functions/src/lib.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Function packages for [DataFusion].
+//!
+//! This crate contains a collection of various function packages for DataFusion,
+//! implemented using the extension API. Users may wish to control which functions
+//! are available, to limit the binary size of their application, as well as to
+//! use dialect specific implementations of functions (e.g. Spark vs Postgres).
+//!
+//! Each package is implemented as a separate
+//! module, activated by a feature flag.
+//!
+//! [DataFusion]: https://crates.io/crates/datafusion
+//!
+//! # Available Packages
+//! See the list of [modules](#modules) in this crate for available packages.
+//!
+//! # Using A Package
+//! You can register all functions in all packages using the [`register_all`] function.
+//!
+//! To access and use only the functions in a certain package, use the
+//! `functions()` method in each module.
+//!
+//! ```
+//! # fn main() -> datafusion_common::Result<()> {
+//! # let mut registry = datafusion_execution::registry::MemoryFunctionRegistry::new();
+//! # use datafusion_execution::FunctionRegistry;
+//! // get the encoding functions
+//! use datafusion_functions::encoding;
+//! for udf in encoding::functions() {
+//!     registry.register_udf(udf)?;
+//! }
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke
+//! functions using a fluent style. For example:
+//!
+//! ```
+//! // create an Expr that will invoke the encode function
+//! use datafusion_expr::{col, lit};
+//! use datafusion_functions::expr_fn;
+//! // Equivalent to "encode(my_data, 'hex')" in SQL:
+//! let expr = expr_fn::encode(col("my_data"), lit("hex"));
+//! ```
+//!
+//! [`Expr`]: datafusion_expr::Expr
+//!
+//! # Implementing A New Package
+//!
+//! To add a new package to this crate, you should follow the model of existing
+//! packages. The high level steps are:
+//!
+//! 1. Create a new module with the appropriate [`AggregateUDF`] implementations.
+//!
+//! 2. Use the macros in [`macros`] to create standard entry points.
+//!
+//! 3. Add a new feature to `Cargo.toml`, with any optional dependencies
+//!
+//! 4. Use the `make_package!` macro to expose the module when the
+//!    feature is enabled.
+//!
+//! [`AggregateUDF`]: datafusion_expr::AggregateUDF
+use std::sync::Arc;
+
+use datafusion_common::Result;
+use datafusion_execution::FunctionRegistry;
+use datafusion_expr::AggregateUDF;
+use log::debug;
+
+pub mod first_last;
+pub mod utils;
+
+#[macro_use]
+pub mod macros;
+
+/// Fluent-style API for creating `Expr`s
+pub mod expr_fn {}
+
+/// Registers all enabled packages with a [`FunctionRegistry`]
+pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
+    let mut functions: Vec<Arc<AggregateUDF>> = vec![];
+
+    functions.into_iter().try_for_each(|udf| {
+        let existing_udaf = registry.register_udaf(udf)?;
+        if let Some(existing_udaf) = existing_udaf {
+            debug!("Overwrite existing UDAF: {}", existing_udaf.name());
+        }
+        Ok(()) as Result<()>
+    })?;
+
+    Ok(())
+}
diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/aggregate-functions/src/macros.rs
new file mode 100644
index 000000000000..dc7d4a3babcb
--- /dev/null
+++ b/datafusion/aggregate-functions/src/macros.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+macro_rules! make_udf_function {
+    ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $AGGREGATE_UDF_FN:ident) => {
+        paste::paste! {
+            // "fluent expr_fn" style function
+            #[doc = $DOC]
+            pub fn $EXPR_FN($($arg: Expr),*) -> Expr {
+                Expr::ScalarFunction(ScalarFunction::new_udf(
+                    $AGGREGATE_UDF_FN(),
+                    vec![$($arg),*],
+                ))
+            }
+
+            /// Singleton instance of [`$UDF`], ensures the UDF is only created once
+            /// named STATIC_$(UDF). For example `STATIC_FirstValue`
+            #[allow(non_upper_case_globals)]
+            static [< STATIC_ $UDF >]: std::sync::OnceLock<std::sync::Arc<datafusion_expr::ScalarUDF>> =
+                std::sync::OnceLock::new();
+
+            /// Function that returns a [`ScalarUDF`] for [`$UDF`]
+            ///
+            /// [`ScalarUDF`]: datafusion_expr::ScalarUDF
+            pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
+                [< STATIC_ $UDF >]
+                    .get_or_init(|| {
+                        std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
+                            <$UDF>::new(),
+                        ))
+                    })
+                    .clone()
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/datafusion/aggregate-functions/src/utils.rs b/datafusion/aggregate-functions/src/utils.rs
new file mode 100644
index 000000000000..b248758bc120
--- /dev/null
+++ b/datafusion/aggregate-functions/src/utils.rs
@@ -0,0 +1,16 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs
index 3296e68d17da..e09684e36524 100644
--- a/datafusion/common/src/utils.rs
+++ b/datafusion/common/src/utils.rs
@@ -679,6 +679,11 @@ pub fn find_indices<T: PartialEq, S: Borrow<T>>(
         .ok_or_else(|| DataFusionError::Execution("Target not found".to_string()))
 }
 
+/// Construct state name. State is the intermediate state of the aggregate function.
+pub fn format_state_name(name: &str, state_name: &str) -> String {
+    format!("{name}[{state_name}]")
+}
+
 #[cfg(test)]
 mod tests {
     use crate::ScalarValue;
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index 4eaaf94ecf5d..c014842cb057 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -70,7 +70,7 @@ use datafusion_common::{
 };
 use datafusion_execution::registry::SerializerRegistry;
 use datafusion_expr::type_coercion::aggregates::NUMERICS;
-use datafusion_expr::{create_first_value, Signature, Volatility};
+use datafusion_expr::{Signature, Volatility};
 use datafusion_expr::{
     logical_plan::{DdlStatement, Statement},
     var_provider::is_system_variables,
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 5294ca754532..a65c9d54560d 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -712,16 +712,16 @@ pub fn create_udaf(
     ))
 }
 
-/// Creates a new UDAF with a specific signature, state type and return type.
-/// The signature and state type must match the `Accumulator's implementation`.
-/// TOOD: We plan to move aggregate function to its own crate. This function will be deprecated then.
-pub fn create_first_value(
-    name: &str,
-    signature: Signature,
-    accumulator: AccumulatorFactoryFunction,
-) -> AggregateUDF {
-    AggregateUDF::from(FirstValue::new(name, signature, accumulator))
-}
+// /// Creates a new UDAF with a specific signature, state type and return type.
+// /// The signature and state type must match the `Accumulator`'s implementation.
+// /// TODO: We plan to move aggregate functions to their own crate. This function will be deprecated then.
+// pub fn create_first_value(
+//     name: &str,
+//     signature: Signature,
+//     accumulator: AccumulatorFactoryFunction,
+// ) -> AggregateUDF {
+//     AggregateUDF::from(FirstValue::new(name, signature, accumulator))
+// }
 
 /// Implements [`AggregateUDFImpl`] for functions that have a single signature and
 /// return type.
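As a quick, hedged illustration (not part of the patch): given the `format_state_name` helper and the `state_fields` implementation added above, the intermediate state field of the new `FIRST_VALUE` UDAF is named by bracketing the state label:

use datafusion_common::utils::format_state_name;

fn main() {
    // This is the state field name that FirstValue::state_fields registers above.
    assert_eq!(
        format_state_name("FIRST_VALUE", "first_value"),
        "FIRST_VALUE[first_value]"
    );
}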
From 5338f61499e14bef5f48d34138d813ad412a2e54 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 21:53:16 +0800 Subject: [PATCH 02/38] move PhysicalExpr Signed-off-by: jayzhan211 --- Cargo.toml | 2 + datafusion/physical-expr-core/Cargo.toml | 81 ++++++ datafusion/physical-expr-core/src/lib.rs | 38 +++ .../physical-expr-core/src/physical_expr.rs | 255 ++++++++++++++++++ .../physical-expr-core/src/sort_expr.rs | 16 ++ .../src/sort_properties.rs | 0 .../src/tree_node.rs | 0 datafusion/physical-expr-core/src/utils.rs | 142 ++++++++++ datafusion/physical-expr/Cargo.toml | 1 + datafusion/physical-expr/src/lib.rs | 19 +- datafusion/physical-expr/src/physical_expr.rs | 236 +--------------- datafusion/physical-expr/src/utils/mod.rs | 116 +------- 12 files changed, 554 insertions(+), 352 deletions(-) create mode 100644 datafusion/physical-expr-core/Cargo.toml create mode 100644 datafusion/physical-expr-core/src/lib.rs create mode 100644 datafusion/physical-expr-core/src/physical_expr.rs create mode 100644 datafusion/physical-expr-core/src/sort_expr.rs rename datafusion/{physical-expr => physical-expr-core}/src/sort_properties.rs (100%) rename datafusion/{physical-expr => physical-expr-core}/src/tree_node.rs (100%) create mode 100644 datafusion/physical-expr-core/src/utils.rs diff --git a/Cargo.toml b/Cargo.toml index 9df489724d46..64c228c89870 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "datafusion/functions", "datafusion/functions-array", "datafusion/optimizer", + "datafusion/physical-expr-core", "datafusion/physical-expr", "datafusion/physical-plan", "datafusion/proto", @@ -80,6 +81,7 @@ datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false } +datafusion-physical-expr-core = { path = "datafusion/physical-expr-core", version = "37.0.0", default-features = false } datafusion-physical-plan = { path = "datafusion/physical-plan", version = "37.0.0" } datafusion-proto = { path = "datafusion/proto", version = "37.0.0" } datafusion-sql = { path = "datafusion/sql", version = "37.0.0" } diff --git a/datafusion/physical-expr-core/Cargo.toml b/datafusion/physical-expr-core/Cargo.toml new file mode 100644 index 000000000000..b67bf20b303c --- /dev/null +++ b/datafusion/physical-expr-core/Cargo.toml @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-physical-expr-core" +description = "Core physical expression implementation for DataFusion query engine" +keywords = ["arrow", "query", "sql"] +readme = "README.md" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } + +[lib] +name = "datafusion_physical_expr_core" +path = "src/lib.rs" + +# TODO: Remove unused features + +[features] +crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] +default = [ + "crypto_expressions", + "regex_expressions", + "encoding_expressions", +] +encoding_expressions = ["base64", "hex"] +regex_expressions = ["regex"] + +[dependencies] +ahash = { version = "0.8", default-features = false, features = [ + "runtime-rng", +] } +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-ord = { workspace = true } +arrow-schema = { workspace = true } +arrow-string = { workspace = true } +base64 = { version = "0.22", optional = true } +blake2 = { version = "^0.10.2", optional = true } +blake3 = { version = "1.0", optional = true } +chrono = { workspace = true } +datafusion-common = { workspace = true, default-features = true } +datafusion-execution = { workspace = true } +datafusion-expr = { workspace = true } +half = { workspace = true } +hashbrown = { version = "0.14", features = ["raw"] } +hex = { version = "0.4", optional = true } +indexmap = { workspace = true } +itertools = { workspace = true, features = ["use_std"] } +log = { workspace = true } +md-5 = { version = "^0.10.0", optional = true } +paste = "^1.0" +petgraph = "0.6.2" +rand = { workspace = true } +regex = { version = "1.8", optional = true } +sha2 = { version = "^0.10.1", optional = true } + +[dev-dependencies] +criterion = "0.5" +rand = { workspace = true } +rstest = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread"] } \ No newline at end of file diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-core/src/lib.rs new file mode 100644 index 000000000000..6d22f9818818 --- /dev/null +++ b/datafusion/physical-expr-core/src/lib.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+pub mod physical_expr;
+pub mod sort_properties;
+pub mod tree_node;
+pub mod utils;
+
+mod sort_expr;
+
+// backwards compatibility
+pub mod execution_props {
+    pub use datafusion_expr::execution_props::ExecutionProps;
+    pub use datafusion_expr::var_provider::{VarProvider, VarType};
+}
+
+// pub use physical_expr::{
+//     physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
+//     PhysicalExpr, PhysicalExprRef,
+// };
+// pub use sort_expr::{
+//     LexOrdering, LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalSortExpr,
+//     PhysicalSortRequirement,
+// };
diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs
new file mode 100644
index 000000000000..b6a1153c1cda
--- /dev/null
+++ b/datafusion/physical-expr-core/src/physical_expr.rs
@@ -0,0 +1,255 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::fmt::{Debug, Display};
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use arrow::array::BooleanArray;
+use arrow::compute::filter_record_batch;
+use arrow::datatypes::{DataType, Schema};
+use arrow::record_batch::RecordBatch;
+use datafusion_common::utils::DataPtr;
+use datafusion_common::{internal_err, not_impl_err, Result};
+use datafusion_expr::interval_arithmetic::Interval;
+use datafusion_expr::ColumnarValue;
+use itertools::izip;
+
+use crate::sort_properties::SortProperties;
+use crate::utils::scatter;
+
+/// `PhysicalExpr` evaluate DataFusion expressions such as `A + 1`, or `CAST(c1
+/// AS int)`.
+///
+/// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical
+/// planning, and can be evaluated directly on a [`RecordBatch`]. They are
+/// normally created from `Expr` by a [`PhysicalPlanner`] and can be created
+/// directly using [`create_physical_expr`].
+///
+/// A Physical expression knows its type, nullability and how to evaluate itself.
+///
+/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html
+/// [`create_physical_expr`]: crate::create_physical_expr
+/// [`Expr`]: datafusion_expr::Expr
+///
+/// # Example: Create `PhysicalExpr` from `Expr`
+/// ```
+/// # use arrow_schema::{DataType, Field, Schema};
+/// # use datafusion_common::DFSchema;
+/// # use datafusion_expr::{Expr, col, lit};
+/// # use datafusion_physical_expr::create_physical_expr;
+/// # use datafusion_expr::execution_props::ExecutionProps;
+/// // For a logical expression `a = 1`, we can create a physical expression
+/// let expr = col("a").eq(lit(1));
+/// // To create a PhysicalExpr we need 1. a schema
+/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+/// let df_schema = DFSchema::try_from(schema).unwrap();
+/// // 2. ExecutionProps
+/// let props = ExecutionProps::new();
+/// // We can now create a PhysicalExpr:
+/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
+/// ```
+///
+/// # Example: Executing a PhysicalExpr to obtain [`ColumnarValue`]
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch};
+/// # use arrow_schema::{DataType, Field, Schema};
+/// # use datafusion_common::{assert_batches_eq, DFSchema};
+/// # use datafusion_expr::{Expr, col, lit, ColumnarValue};
+/// # use datafusion_physical_expr::create_physical_expr;
+/// # use datafusion_expr::execution_props::ExecutionProps;
+/// # let expr = col("a").eq(lit(1));
+/// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+/// # let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+/// # let props = ExecutionProps::new();
+/// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this:
+/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
+/// // Input of [1,2,3]
+/// let input_batch = RecordBatch::try_from_iter(vec![
+///   ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _)
+/// ]).unwrap();
+/// // The result is a ColumnarValue (either an Array or a Scalar)
+/// let result = physical_expr.evaluate(&input_batch).unwrap();
+/// // In this case, a BooleanArray with the result of the comparison
+/// let ColumnarValue::Array(arr) = result else {
+///   panic!("Expected an array")
+/// };
+/// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false]));
+/// ```
+pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq<dyn Any> {
+    /// Returns the physical expression as [`Any`] so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+    /// Get the data type of this expression, given the schema of the input
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType>;
+    /// Determine whether this expression is nullable, given the schema of the input
+    fn nullable(&self, input_schema: &Schema) -> Result<bool>;
+    /// Evaluate an expression against a RecordBatch
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue>;
+    /// Evaluate an expression against a RecordBatch after first applying a
+    /// validity array
+    fn evaluate_selection(
+        &self,
+        batch: &RecordBatch,
+        selection: &BooleanArray,
+    ) -> Result<ColumnarValue> {
+        let tmp_batch = filter_record_batch(batch, selection)?;
+
+        let tmp_result = self.evaluate(&tmp_batch)?;
+
+        if batch.num_rows() == tmp_batch.num_rows() {
+            // All values from the `selection` filter are true.
+            Ok(tmp_result)
+        } else if let ColumnarValue::Array(a) = tmp_result {
+            scatter(selection, a.as_ref()).map(ColumnarValue::Array)
+        } else {
+            Ok(tmp_result)
+        }
+    }
+
+    /// Get a list of child PhysicalExpr that provide the input for this expr.
+    fn children(&self) -> Vec<Arc<dyn PhysicalExpr>>;
+
+    /// Returns a new PhysicalExpr where all children were replaced by new exprs.
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>>;
+
+    /// Computes the output interval for the expression, given the input
+    /// intervals.
+    ///
+    /// # Arguments
+    ///
+    /// * `children` are the intervals for the children (inputs) of this
+    /// expression.
+    ///
+    /// # Example
+    ///
+    /// If the expression is `a + b`, and the input intervals are `a: [1, 2]`
+    /// and `b: [3, 4]`, then the output interval would be `[4, 6]`.
+    fn evaluate_bounds(&self, _children: &[&Interval]) -> Result<Interval> {
+        not_impl_err!("Not implemented for {self}")
+    }
+
+    /// Updates bounds for child expressions, given a known interval for this
+    /// expression.
+    ///
+    /// This is used to propagate constraints down through an expression tree.
+    ///
+    /// # Arguments
+    ///
+    /// * `interval` is the currently known interval for this expression.
+    /// * `children` are the current intervals for the children of this expression.
+    ///
+    /// # Returns
+    ///
+    /// A `Vec` of new intervals for the children, in order.
+    ///
+    /// If constraint propagation reveals an infeasibility for any child, returns
+    /// [`None`]. If none of the children intervals change as a result of propagation,
+    /// may return an empty vector instead of cloning `children`. This is the default
+    /// (and conservative) return value.
+    ///
+    /// # Example
+    ///
+    /// If the expression is `a + b`, the current `interval` is `[4, 5]` and the
+    /// inputs `a` and `b` are respectively given as `[0, 2]` and `[-∞, 4]`, then
+    /// propagation would return `[0, 2]` and `[2, 4]` as `b` must be at
+    /// least `2` to make the output at least `4`.
+    fn propagate_constraints(
+        &self,
+        _interval: &Interval,
+        _children: &[&Interval],
+    ) -> Result<Option<Vec<Interval>>> {
+        Ok(Some(vec![]))
+    }
+
+    /// Update the hash `state` with this expression requirements from
+    /// [`Hash`].
+    ///
+    /// This method is required to support hashing [`PhysicalExpr`]s. To
+    /// implement it, typically the type implementing
+    /// [`PhysicalExpr`] implements [`Hash`] and
+    /// then the following boiler plate is used:
+    ///
+    /// # Example:
+    /// ```
+    /// // User defined expression that derives Hash
+    /// #[derive(Hash, Debug, PartialEq, Eq)]
+    /// struct MyExpr {
+    ///   val: u64
+    /// }
+    ///
+    /// // impl PhysicalExpr {
+    /// // ...
+    /// # impl MyExpr {
+    ///   // Boiler plate to call the derived Hash impl
+    ///   fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) {
+    ///     use std::hash::Hash;
+    ///     let mut s = state;
+    ///     self.hash(&mut s);
+    ///   }
+    /// // }
+    /// # }
+    /// ```
+    /// Note: [`PhysicalExpr`] is not constrained by [`Hash`]
+    /// directly because it must remain object safe.
+    fn dyn_hash(&self, _state: &mut dyn Hasher);
+
+    /// The order information of a PhysicalExpr can be estimated from its children.
+    /// This is especially helpful for projection expressions. If we can ensure that the
+    /// order of a PhysicalExpr to project matches with the order of SortExec, we can
+    /// eliminate that SortExec.
+    ///
+    /// By recursively calling this function, we can obtain the overall order
+    /// information of the PhysicalExpr. Since `SortOptions` cannot fully handle
+    /// the propagation of unordered columns and literals, the `SortProperties`
+    /// struct is used.
+    fn get_ordering(&self, _children: &[SortProperties]) -> SortProperties {
+        SortProperties::Unordered
+    }
+}
+
+impl Hash for dyn PhysicalExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.dyn_hash(state);
+    }
+}
+
+/// Returns a copy of this expr if we change any child according to the pointer comparison.
+/// The size of `children` must be equal to the size of `PhysicalExpr::children()`.
+pub fn with_new_children_if_necessary(
+    expr: Arc<dyn PhysicalExpr>,
+    children: Vec<Arc<dyn PhysicalExpr>>,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    let old_children = expr.children();
+    if children.len() != old_children.len() {
+        internal_err!("PhysicalExpr: Wrong number of children")
+    } else if children.is_empty()
+        || children
+            .iter()
+            .zip(old_children.iter())
+            .any(|(c1, c2)| !Arc::data_ptr_eq(c1, c2))
+    {
+        Ok(expr.with_new_children(children)?)
+    } else {
+        Ok(expr)
+    }
+}
diff --git a/datafusion/physical-expr-core/src/sort_expr.rs b/datafusion/physical-expr-core/src/sort_expr.rs
new file mode 100644
index 000000000000..b248758bc120
--- /dev/null
+++ b/datafusion/physical-expr-core/src/sort_expr.rs
@@ -0,0 +1,16 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
diff --git a/datafusion/physical-expr/src/sort_properties.rs b/datafusion/physical-expr-core/src/sort_properties.rs
similarity index 100%
rename from datafusion/physical-expr/src/sort_properties.rs
rename to datafusion/physical-expr-core/src/sort_properties.rs
diff --git a/datafusion/physical-expr/src/tree_node.rs b/datafusion/physical-expr-core/src/tree_node.rs
similarity index 100%
rename from datafusion/physical-expr/src/tree_node.rs
rename to datafusion/physical-expr-core/src/tree_node.rs
diff --git a/datafusion/physical-expr-core/src/utils.rs b/datafusion/physical-expr-core/src/utils.rs
new file mode 100644
index 000000000000..612c43f4a794
--- /dev/null
+++ b/datafusion/physical-expr-core/src/utils.rs
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::{
+    array::MutableArrayData,
+    compute::{and_kleene, is_not_null, SlicesIterator},
+};
+use arrow_array::{make_array, Array, ArrayRef, BooleanArray};
+use datafusion_common::Result;
+
+/// Scatter `truthy` array by boolean mask. When the mask evaluates `true`, the next values of
+/// `truthy` are taken; when the mask evaluates `false`, null values are filled.
+///
+/// # Arguments
+/// * `mask` - Boolean values used to determine where to put the `truthy` values
+/// * `truthy` - All values of this array are to be scattered according to `mask` into the final result.
+pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
+    let truthy = truthy.to_data();
+
+    // update the mask so that any null values become false
+    // (SlicesIterator doesn't respect nulls)
+    let mask = and_kleene(mask, &is_not_null(mask)?)?;
+
+    let mut mutable = MutableArrayData::new(vec![&truthy], true, mask.len());
+
+    // the SlicesIterator slices only the true values, so we need to fill the
+    // gaps it leaves with falsy values
+
+    // keep track of how much is filled
+    let mut filled = 0;
+    // keep track of current position we have in truthy array
+    let mut true_pos = 0;
+
+    SlicesIterator::new(&mask).for_each(|(start, end)| {
+        // the gap needs to be filled with nulls
+        if start > filled {
+            mutable.extend_nulls(start - filled);
+        }
+        // fill with truthy values
+        let len = end - start;
+        mutable.extend(0, true_pos, true_pos + len);
+        true_pos += len;
+        filled = end;
+    });
+    // the remaining part is falsy
+    if filled < mask.len() {
+        mutable.extend_nulls(mask.len() - filled);
+    }
+
+    let data = mutable.freeze();
+    Ok(make_array(data))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow_array::Int32Array;
+    use datafusion_common::cast::{as_boolean_array, as_int32_array};
+
+    use super::*;
+
+    #[test]
+    fn scatter_int() -> Result<()> {
+        let truthy = Arc::new(Int32Array::from(vec![1, 10, 11, 100]));
+        let mask = BooleanArray::from(vec![true, true, false, false, true]);
+
+        // the output array is expected to be the same length as the mask array
+        let expected =
+            Int32Array::from_iter(vec![Some(1), Some(10), None, None, Some(11)]);
+        let result = scatter(&mask, truthy.as_ref())?;
+        let result = as_int32_array(&result)?;
+
+        assert_eq!(&expected, result);
+        Ok(())
+    }
+
+    #[test]
+    fn scatter_int_end_with_false() -> Result<()> {
+        let truthy = Arc::new(Int32Array::from(vec![1, 10, 11, 100]));
+        let mask = BooleanArray::from(vec![true, false, true, false, false, false]);
+
+        // output should be same length as mask
+        let expected =
+            Int32Array::from_iter(vec![Some(1), None, Some(10), None, None, None]);
+        let result = scatter(&mask, truthy.as_ref())?;
+        let result = as_int32_array(&result)?;
+
+        assert_eq!(&expected, result);
+        Ok(())
+    }
+
+    #[test]
+    fn scatter_with_null_mask() -> Result<()> {
+        let truthy = Arc::new(Int32Array::from(vec![1, 10, 11]));
+        let mask: BooleanArray = vec![Some(false), None, Some(true), Some(true), None]
+            .into_iter()
+            .collect();
+
+        // output should treat nulls as though they are false
+        let expected = Int32Array::from_iter(vec![None, None, Some(1), Some(10), None]);
+        let result = scatter(&mask, truthy.as_ref())?;
+        let result = as_int32_array(&result)?;
+
+        assert_eq!(&expected, result);
+        Ok(())
+    }
+
+    #[test]
+    fn scatter_boolean() -> Result<()> {
+        let truthy = Arc::new(BooleanArray::from(vec![false, false, false, true]));
+        let mask = BooleanArray::from(vec![true, true, false, false, true]);
+
+        // the output array is expected to be the same length as the mask array
+        let expected = BooleanArray::from_iter(vec![
+            Some(false),
+            Some(false),
+            None,
+            None,
+            Some(false),
+        ]);
+        let result = scatter(&mask, truthy.as_ref())?;
+        let result = as_boolean_array(&result)?;
+
+        assert_eq!(&expected, result);
+        Ok(())
+    }
+}
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index baca00bea724..ec802ae10bc2 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -59,6 +59,7 @@ chrono = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-physical-expr-core = { workspace = true }
 half = { workspace = true }
 hashbrown = { version = "0.14", features = ["raw"] }
 hex = { version = "0.4", optional = true }
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
index 655771270a6b..f6a9d3083ce3 100644
--- a/datafusion/physical-expr/src/lib.rs
+++ b/datafusion/physical-expr/src/lib.rs
@@ -29,9 +29,7 @@ mod physical_expr;
 pub mod planner;
 mod scalar_function;
 mod sort_expr;
-pub mod sort_properties;
 pub mod string_expressions;
-pub mod tree_node;
 pub mod udf;
 pub mod utils;
 pub mod window;
@@ -49,7 +47,7 @@ pub use equivalence::EquivalenceProperties;
 pub use partitioning::{Distribution, Partitioning};
 pub use physical_expr::{
     physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
-    PhysicalExpr, PhysicalExprRef,
+    PhysicalExprRef,
 };
 pub use planner::{create_physical_expr, create_physical_exprs};
 pub use scalar_function::ScalarFunctionExpr;
@@ -60,3 +58,18 @@ pub use sort_expr::{
 pub use utils::{reverse_order_bys, split_conjunction};
 
 pub use aggregate::first_last::create_first_value_accumulator;
+
+// For backwards compatibility
+pub mod sort_properties {
+    pub use datafusion_physical_expr_core::sort_properties::{
+        ExprOrdering, SortProperties,
+    };
+}
+
+// For backwards compatibility
+pub mod tree_node {
+    pub use datafusion_physical_expr_core::tree_node::ExprContext;
+}
+
+// For backwards compatibility
+pub use datafusion_physical_expr_core::physical_expr::PhysicalExpr;
diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs
index 861a4ad02801..c9e67acaf8ff 100644
--- a/datafusion/physical-expr/src/physical_expr.rs
+++ b/datafusion/physical-expr/src/physical_expr.rs
@@ -16,248 +16,14 @@
 // under the License.
 
 use std::any::Any;
-use std::fmt::{Debug, Display};
-use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use crate::sort_properties::SortProperties;
-use crate::utils::scatter;
-
-use arrow::array::BooleanArray;
-use arrow::compute::filter_record_batch;
-use arrow::datatypes::{DataType, Schema};
-use arrow::record_batch::RecordBatch;
-use datafusion_common::utils::DataPtr;
-use datafusion_common::{internal_err, not_impl_err, Result};
-use datafusion_expr::interval_arithmetic::Interval;
-use datafusion_expr::ColumnarValue;
-
+use datafusion_physical_expr_core::physical_expr::PhysicalExpr;
 use itertools::izip;
 
-/// `PhysicalExpr` evaluate DataFusion expressions such as `A + 1`, or `CAST(c1
-/// AS int)`.
-///
-/// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical
-/// planning, and can be evaluated directly on a [`RecordBatch`]. They are
-/// normally created from `Expr` by a [`PhysicalPlanner`] and can be created
-/// directly using [`create_physical_expr`].
-///
-/// A Physical expression knows its type, nullability and how to evaluate itself.
-///
-/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html
-/// [`create_physical_expr`]: crate::create_physical_expr
-/// [`Expr`]: datafusion_expr::Expr
-///
-/// # Example: Create `PhysicalExpr` from `Expr`
-/// ```
-/// # use arrow_schema::{DataType, Field, Schema};
-/// # use datafusion_common::DFSchema;
-/// # use datafusion_expr::{Expr, col, lit};
-/// # use datafusion_physical_expr::create_physical_expr;
-/// # use datafusion_expr::execution_props::ExecutionProps;
-/// // For a logical expression `a = 1`, we can create a physical expression
-/// let expr = col("a").eq(lit(1));
-/// // To create a PhysicalExpr we need 1. a schema
-/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
-/// let df_schema = DFSchema::try_from(schema).unwrap();
-/// // 2. ExecutionProps
-/// let props = ExecutionProps::new();
-/// // We can now create a PhysicalExpr:
-/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
-/// ```
-///
-/// # Example: Executing a PhysicalExpr to obtain [`ColumnarValue`]
-/// ```
-/// # use std::sync::Arc;
-/// # use arrow_array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch};
-/// # use arrow_schema::{DataType, Field, Schema};
-/// # use datafusion_common::{assert_batches_eq, DFSchema};
-/// # use datafusion_expr::{Expr, col, lit, ColumnarValue};
-/// # use datafusion_physical_expr::create_physical_expr;
-/// # use datafusion_expr::execution_props::ExecutionProps;
-/// # let expr = col("a").eq(lit(1));
-/// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
-/// # let df_schema = DFSchema::try_from(schema.clone()).unwrap();
-/// # let props = ExecutionProps::new();
-/// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this:
-/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap();
-/// // Input of [1,2,3]
-/// let input_batch = RecordBatch::try_from_iter(vec![
-///   ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _)
-/// ]).unwrap();
-/// // The result is a ColumnarValue (either an Array or a Scalar)
-/// let result = physical_expr.evaluate(&input_batch).unwrap();
-/// // In this case, a BooleanArray with the result of the comparison
-/// let ColumnarValue::Array(arr) = result else {
-///   panic!("Expected an array")
-/// };
-/// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false]));
-/// ```
-pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq<dyn Any> {
-    /// Returns the physical expression as [`Any`] so that it can be
-    /// downcast to a specific implementation.
-    fn as_any(&self) -> &dyn Any;
-    /// Get the data type of this expression, given the schema of the input
-    fn data_type(&self, input_schema: &Schema) -> Result<DataType>;
-    /// Determine whether this expression is nullable, given the schema of the input
-    fn nullable(&self, input_schema: &Schema) -> Result<bool>;
-    /// Evaluate an expression against a RecordBatch
-    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue>;
-    /// Evaluate an expression against a RecordBatch after first applying a
-    /// validity array
-    fn evaluate_selection(
-        &self,
-        batch: &RecordBatch,
-        selection: &BooleanArray,
-    ) -> Result<ColumnarValue> {
-        let tmp_batch = filter_record_batch(batch, selection)?;
-
-        let tmp_result = self.evaluate(&tmp_batch)?;
-
-        if batch.num_rows() == tmp_batch.num_rows() {
-            // All values from the `selection` filter are true.
-            Ok(tmp_result)
-        } else if let ColumnarValue::Array(a) = tmp_result {
-            scatter(selection, a.as_ref()).map(ColumnarValue::Array)
-        } else {
-            Ok(tmp_result)
-        }
-    }
-
-    /// Get a list of child PhysicalExpr that provide the input for this expr.
-    fn children(&self) -> Vec<Arc<dyn PhysicalExpr>>;
-
-    /// Returns a new PhysicalExpr where all children were replaced by new exprs.
-    fn with_new_children(
-        self: Arc<Self>,
-        children: Vec<Arc<dyn PhysicalExpr>>,
-    ) -> Result<Arc<dyn PhysicalExpr>>;
-
-    /// Computes the output interval for the expression, given the input
-    /// intervals.
-    ///
-    /// # Arguments
-    ///
-    /// * `children` are the intervals for the children (inputs) of this
-    /// expression.
-    ///
-    /// # Example
-    ///
-    /// If the expression is `a + b`, and the input intervals are `a: [1, 2]`
-    /// and `b: [3, 4]`, then the output interval would be `[4, 6]`.
-    fn evaluate_bounds(&self, _children: &[&Interval]) -> Result<Interval> {
-        not_impl_err!("Not implemented for {self}")
-    }
-
-    /// Updates bounds for child expressions, given a known interval for this
-    /// expression.
-    ///
-    /// This is used to propagate constraints down through an expression tree.
-    ///
-    /// # Arguments
-    ///
-    /// * `interval` is the currently known interval for this expression.
-    /// * `children` are the current intervals for the children of this expression.
-    ///
-    /// # Returns
-    ///
-    /// A `Vec` of new intervals for the children, in order.
-    ///
-    /// If constraint propagation reveals an infeasibility for any child, returns
-    /// [`None`]. If none of the children intervals change as a result of propagation,
-    /// may return an empty vector instead of cloning `children`. This is the default
-    /// (and conservative) return value.
-    ///
-    /// # Example
-    ///
-    /// If the expression is `a + b`, the current `interval` is `[4, 5]` and the
-    /// inputs `a` and `b` are respectively given as `[0, 2]` and `[-∞, 4]`, then
-    /// propagation would would return `[0, 2]` and `[2, 4]` as `b` must be at
-    /// least `2` to make the output at least `4`.
-    fn propagate_constraints(
-        &self,
-        _interval: &Interval,
-        _children: &[&Interval],
-    ) -> Result<Option<Vec<Interval>>> {
-        Ok(Some(vec![]))
-    }
-
-    /// Update the hash `state` with this expression requirements from
-    /// [`Hash`].
-    ///
-    /// This method is required to support hashing [`PhysicalExpr`]s. To
-    /// implement it, typically the type implementing
-    /// [`PhysicalExpr`] implements [`Hash`] and
-    /// then the following boiler plate is used:
-    ///
-    /// # Example:
-    /// ```
-    /// // User defined expression that derives Hash
-    /// #[derive(Hash, Debug, PartialEq, Eq)]
-    /// struct MyExpr {
-    ///   val: u64
-    /// }
-    ///
-    /// // impl PhysicalExpr {
-    /// // ...
-    /// # impl MyExpr {
-    ///   // Boiler plate to call the derived Hash impl
-    ///   fn dyn_hash(&self, state: &mut dyn std::hash::Hasher) {
-    ///     use std::hash::Hash;
-    ///     let mut s = state;
-    ///     self.hash(&mut s);
-    ///   }
-    /// // }
-    /// # }
-    /// ```
-    /// Note: [`PhysicalExpr`] is not constrained by [`Hash`]
-    /// directly because it must remain object safe.
-    fn dyn_hash(&self, _state: &mut dyn Hasher);
-
-    /// The order information of a PhysicalExpr can be estimated from its children.
-    /// This is especially helpful for projection expressions. If we can ensure that the
-    /// order of a PhysicalExpr to project matches with the order of SortExec, we can
-    /// eliminate that SortExecs.
-    ///
-    /// By recursively calling this function, we can obtain the overall order
-    /// information of the PhysicalExpr. Since `SortOptions` cannot fully handle
-    /// the propagation of unordered columns and literals, the `SortProperties`
-    /// struct is used.
-    fn get_ordering(&self, _children: &[SortProperties]) -> SortProperties {
-        SortProperties::Unordered
-    }
-}
-
-impl Hash for dyn PhysicalExpr {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        self.dyn_hash(state);
-    }
-}
-
 /// Shared [`PhysicalExpr`].
 pub type PhysicalExprRef = Arc<dyn PhysicalExpr>;
 
-/// Returns a copy of this expr if we change any child according to the pointer comparison.
-/// The size of `children` must be equal to the size of `PhysicalExpr::children()`.
-pub fn with_new_children_if_necessary(
-    expr: Arc<dyn PhysicalExpr>,
-    children: Vec<Arc<dyn PhysicalExpr>>,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    let old_children = expr.children();
-    if children.len() != old_children.len() {
-        internal_err!("PhysicalExpr: Wrong number of children")
-    } else if children.is_empty()
-        || children
-            .iter()
-            .zip(old_children.iter())
-            .any(|(c1, c2)| !Arc::data_ptr_eq(c1, c2))
-    {
-        Ok(expr.with_new_children(children)?)
-    } else {
-        Ok(expr)
-    }
-}
-
 pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
     if any.is::<Arc<dyn PhysicalExpr>>() {
         any.downcast_ref::<Arc<dyn PhysicalExpr>>()
diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs
index b8e99403d695..0aea05556697 100644
--- a/datafusion/physical-expr/src/utils/mod.rs
+++ b/datafusion/physical-expr/src/utils/mod.rs
@@ -24,10 +24,9 @@ use std::sync::Arc;
 
 use crate::expressions::{BinaryExpr, Column};
 use crate::tree_node::ExprContext;
-use crate::{PhysicalExpr, PhysicalSortExpr};
+use crate::PhysicalExpr;
+use crate::PhysicalSortExpr;
 
-use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData};
-use arrow::compute::{and_kleene, is_not_null, SlicesIterator};
 use arrow::datatypes::SchemaRef;
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
@@ -257,49 +256,6 @@ pub fn reverse_order_bys(order_bys: &[PhysicalSortExpr]) -> Vec<PhysicalSortExpr> {
 }
 
-/// Scatter `truthy` array by boolean mask. When the mask evaluates `true`, next values of `truthy`
-/// are taken, when the mask evaluates `false` values null values are filled.
-///
-/// # Arguments
-/// * `mask` - Boolean values used to determine where to put the `truthy` values
-/// * `truthy` - All values of this array are to scatter according to `mask` into final result.
-pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
-    let truthy = truthy.to_data();
-
-    // update the mask so that any null values become false
-    // (SlicesIterator doesn't respect nulls)
-    let mask = and_kleene(mask, &is_not_null(mask)?)?;
-
-    let mut mutable = MutableArrayData::new(vec![&truthy], true, mask.len());
-
-    // the SlicesIterator slices only the true values. So the gaps left by this iterator we need to
-    // fill with falsy values
-
-    // keep track of how much is filled
-    let mut filled = 0;
-    // keep track of current position we have in truthy array
-    let mut true_pos = 0;
-
-    SlicesIterator::new(&mask).for_each(|(start, end)| {
-        // the gap needs to be filled with nulls
-        if start > filled {
-            mutable.extend_nulls(start - filled);
-        }
-        // fill with truthy values
-        let len = end - start;
-        mutable.extend(0, true_pos, true_pos + len);
-        true_pos += len;
-        filled = end;
-    });
-    // the remaining part is falsy
-    if filled < mask.len() {
-        mutable.extend_nulls(mask.len() - filled);
-    }
-
-    let data = mutable.freeze();
-    Ok(make_array(data))
-}
-
 /// Merge left and right sort expressions, checking for duplicates.
pub fn merge_vectors( left: &[PhysicalSortExpr], @@ -321,9 +277,7 @@ mod tests { use crate::expressions::{binary, cast, col, in_list, lit, Column, Literal}; use crate::PhysicalSortExpr; - use arrow_array::Int32Array; use arrow_schema::{DataType, Field, Schema}; - use datafusion_common::cast::{as_boolean_array, as_int32_array}; use datafusion_common::{Result, ScalarValue}; use petgraph::visit::Bfs; @@ -517,70 +471,4 @@ mod tests { assert_eq!(collect_columns(&expr3), expected); Ok(()) } - - #[test] - fn scatter_int() -> Result<()> { - let truthy = Arc::new(Int32Array::from(vec![1, 10, 11, 100])); - let mask = BooleanArray::from(vec![true, true, false, false, true]); - - // the output array is expected to be the same length as the mask array - let expected = - Int32Array::from_iter(vec![Some(1), Some(10), None, None, Some(11)]); - let result = scatter(&mask, truthy.as_ref())?; - let result = as_int32_array(&result)?; - - assert_eq!(&expected, result); - Ok(()) - } - - #[test] - fn scatter_int_end_with_false() -> Result<()> { - let truthy = Arc::new(Int32Array::from(vec![1, 10, 11, 100])); - let mask = BooleanArray::from(vec![true, false, true, false, false, false]); - - // output should be same length as mask - let expected = - Int32Array::from_iter(vec![Some(1), None, Some(10), None, None, None]); - let result = scatter(&mask, truthy.as_ref())?; - let result = as_int32_array(&result)?; - - assert_eq!(&expected, result); - Ok(()) - } - - #[test] - fn scatter_with_null_mask() -> Result<()> { - let truthy = Arc::new(Int32Array::from(vec![1, 10, 11])); - let mask: BooleanArray = vec![Some(false), None, Some(true), Some(true), None] - .into_iter() - .collect(); - - // output should treat nulls as though they are false - let expected = Int32Array::from_iter(vec![None, None, Some(1), Some(10), None]); - let result = scatter(&mask, truthy.as_ref())?; - let result = as_int32_array(&result)?; - - assert_eq!(&expected, result); - Ok(()) - } - - #[test] - fn scatter_boolean() -> Result<()> { - let truthy = Arc::new(BooleanArray::from(vec![false, false, false, true])); - let mask = BooleanArray::from(vec![true, true, false, false, true]); - - // the output array is expected to be the same length as the mask array - let expected = BooleanArray::from_iter(vec![ - Some(false), - Some(false), - None, - None, - Some(false), - ]); - let result = scatter(&mask, truthy.as_ref())?; - let result = as_boolean_array(&result)?; - - assert_eq!(&expected, result); - Ok(()) - } } From 450ae4b8427001c8ff2fc5bd8a643b70e52b290b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 21:54:35 +0800 Subject: [PATCH 03/38] cleanup Signed-off-by: jayzhan211 --- datafusion/physical-expr-core/src/lib.rs | 8 ++++---- datafusion/physical-expr-core/src/physical_expr.rs | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-core/src/lib.rs index 6d22f9818818..8a40615c407d 100644 --- a/datafusion/physical-expr-core/src/lib.rs +++ b/datafusion/physical-expr-core/src/lib.rs @@ -23,10 +23,10 @@ pub mod utils; mod sort_expr; // backwards compatibility -pub mod execution_props { - pub use datafusion_expr::execution_props::ExecutionProps; - pub use datafusion_expr::var_provider::{VarProvider, VarType}; -} +// pub mod execution_props { +// pub use datafusion_expr::execution_props::ExecutionProps; +// pub use datafusion_expr::var_provider::{VarProvider, VarType}; +// } // pub use physical_expr::{ // physical_exprs_bag_equal, physical_exprs_contains, 
physical_exprs_equal, diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs index b6a1153c1cda..67516034af44 100644 --- a/datafusion/physical-expr-core/src/physical_expr.rs +++ b/datafusion/physical-expr-core/src/physical_expr.rs @@ -28,7 +28,6 @@ use datafusion_common::utils::DataPtr; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::ColumnarValue; -use itertools::izip; use crate::sort_properties::SortProperties; use crate::utils::scatter; From 362496461d27087ae4d791a51128df14f4a40892 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 22:09:32 +0800 Subject: [PATCH 04/38] move physical sort Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 26 ++ datafusion/physical-expr-core/Cargo.toml | 14 +- datafusion/physical-expr-core/src/lib.rs | 20 +- .../physical-expr-core/src/sort_expr.rs | 253 ++++++++++++++++ datafusion/physical-expr/src/lib.rs | 14 +- datafusion/physical-expr/src/sort_expr.rs | 269 ------------------ 6 files changed, 289 insertions(+), 307 deletions(-) delete mode 100644 datafusion/physical-expr/src/sort_expr.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 3be92221d3ee..b97981889e6e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1331,6 +1331,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr-core", "half", "hashbrown 0.14.3", "hex", @@ -1345,6 +1346,31 @@ dependencies = [ "sha2", ] +[[package]] +name = "datafusion-physical-expr-core" +version = "37.0.0" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-string", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "half", + "hashbrown 0.14.3", + "indexmap 2.2.6", + "itertools", + "log", + "paste", + "petgraph", + "rand", +] + [[package]] name = "datafusion-physical-plan" version = "37.0.0" diff --git a/datafusion/physical-expr-core/Cargo.toml b/datafusion/physical-expr-core/Cargo.toml index b67bf20b303c..5e6badc1c846 100644 --- a/datafusion/physical-expr-core/Cargo.toml +++ b/datafusion/physical-expr-core/Cargo.toml @@ -32,18 +32,6 @@ rust-version = { workspace = true } name = "datafusion_physical_expr_core" path = "src/lib.rs" -# TODO: Remove unused features - -[features] -crypto_expressions = ["md-5", "sha2", "blake2", "blake3"] -default = [ - "crypto_expressions", - "regex_expressions", - "encoding_expressions", -] -encoding_expressions = ["base64", "hex"] -regex_expressions = ["regex"] - [dependencies] ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", @@ -78,4 +66,4 @@ sha2 = { version = "^0.10.1", optional = true } criterion = "0.5" rand = { workspace = true } rstest = { workspace = true } -tokio = { workspace = true, features = ["rt-multi-thread"] } \ No newline at end of file +tokio = { workspace = true, features = ["rt-multi-thread"] } diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-core/src/lib.rs index 8a40615c407d..9ccc35a351b8 100644 --- a/datafusion/physical-expr-core/src/lib.rs +++ b/datafusion/physical-expr-core/src/lib.rs @@ -16,23 +16,7 @@ // under the License. 
pub mod physical_expr;
+pub mod sort_expr;
 pub mod sort_properties;
 pub mod tree_node;
-pub mod utils;
-
-mod sort_expr;
-
-// backwards compatibility
-// pub mod execution_props {
-//     pub use datafusion_expr::execution_props::ExecutionProps;
-//     pub use datafusion_expr::var_provider::{VarProvider, VarType};
-// }
-
-// pub use physical_expr::{
-//     physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
-//     PhysicalExpr, PhysicalExprRef,
-// };
-// pub use sort_expr::{
-//     LexOrdering, LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalSortExpr,
-//     PhysicalSortRequirement,
-// };
+pub mod utils;
\ No newline at end of file
diff --git a/datafusion/physical-expr-core/src/sort_expr.rs b/datafusion/physical-expr-core/src/sort_expr.rs
index b248758bc120..58ddb4303d2e 100644
--- a/datafusion/physical-expr-core/src/sort_expr.rs
+++ b/datafusion/physical-expr-core/src/sort_expr.rs
@@ -14,3 +14,256 @@
 // KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations
 // under the License.
+
+//! Sort expressions
+
+use std::fmt::Display;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use arrow::compute::kernels::sort::{SortColumn, SortOptions};
+use arrow::record_batch::RecordBatch;
+use arrow_schema::Schema;
+use datafusion_common::Result;
+use datafusion_expr::ColumnarValue;
+
+use crate::physical_expr::PhysicalExpr;
+
+/// Represents Sort operation for a column in a RecordBatch
+#[derive(Clone, Debug)]
+pub struct PhysicalSortExpr {
+    /// Physical expression representing the column to sort
+    pub expr: Arc<dyn PhysicalExpr>,
+    /// Option to specify how the given column should be sorted
+    pub options: SortOptions,
+}
+
+impl PartialEq for PhysicalSortExpr {
+    fn eq(&self, other: &PhysicalSortExpr) -> bool {
+        self.options == other.options && self.expr.eq(&other.expr)
+    }
+}
+
+impl Eq for PhysicalSortExpr {}
+
+impl Hash for PhysicalSortExpr {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.expr.hash(state);
+        self.options.hash(state);
+    }
+}
+
+impl std::fmt::Display for PhysicalSortExpr {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{} {}", self.expr, to_str(&self.options))
+    }
+}
+
+impl PhysicalSortExpr {
+    /// Evaluate the sort expression into a SortColumn that can be passed into the arrow sort kernel
+    pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result<SortColumn> {
+        let value_to_sort = self.expr.evaluate(batch)?;
+        let array_to_sort = match value_to_sort {
+            ColumnarValue::Array(array) => array,
+            ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(batch.num_rows())?,
+        };
+        Ok(SortColumn {
+            values: array_to_sort,
+            options: Some(self.options),
+        })
+    }
+
+    /// Checks whether this sort expression satisfies the given `requirement`.
+    /// If sort options are unspecified in `requirement`, only the expressions
+    /// are compared for equality.
+    pub fn satisfy(
+        &self,
+        requirement: &PhysicalSortRequirement,
+        schema: &Schema,
+    ) -> bool {
+        // If the column is not nullable, NULLS FIRST/LAST is not important.
+        let nullable = self.expr.nullable(schema).unwrap_or(true);
+        self.expr.eq(&requirement.expr)
+            && if nullable {
+                requirement
+                    .options
+                    .map_or(true, |opts| self.options == opts)
+            } else {
+                requirement
+                    .options
+                    .map_or(true, |opts| self.options.descending == opts.descending)
+            }
+    }
+
+    /// Returns a [`Display`]able list of `PhysicalSortExpr`.
+    pub fn format_list(input: &[PhysicalSortExpr]) -> impl Display + '_ {
+        struct DisplayableList<'a>(&'a [PhysicalSortExpr]);
+        impl<'a> Display for DisplayableList<'a> {
+            fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                let mut first = true;
+                for sort_expr in self.0 {
+                    if first {
+                        first = false;
+                    } else {
+                        write!(f, ",")?;
+                    }
+                    write!(f, "{}", sort_expr)?;
+                }
+                Ok(())
+            }
+        }
+        DisplayableList(input)
+    }
+}
+
+/// Represents sort requirement associated with a plan
+///
+/// If the requirement includes [`SortOptions`] then both the
+/// expression *and* the sort options must match.
+///
+/// If the requirement does not include [`SortOptions`] then only the
+/// expressions must match.
+///
+/// # Examples
+///
+/// With sort options (`A`, `DESC NULLS FIRST`):
+/// * `ORDER BY A DESC NULLS FIRST` matches
+/// * `ORDER BY A ASC NULLS FIRST` does not match (`ASC` vs `DESC`)
+/// * `ORDER BY B DESC NULLS FIRST` does not match (different expr)
+///
+/// Without sort options (`A`, None):
+/// * `ORDER BY A DESC NULLS FIRST` matches
+/// * `ORDER BY A ASC NULLS FIRST` matches (`ASC` and `NULL` options ignored)
+/// * `ORDER BY B DESC NULLS FIRST` does not match (different expr)
+#[derive(Clone, Debug)]
+pub struct PhysicalSortRequirement {
+    /// Physical expression representing the column to sort
+    pub expr: Arc<dyn PhysicalExpr>,
+    /// Option to specify how the given column should be sorted.
+    /// If unspecified, there are no constraints on sort options.
+    pub options: Option<SortOptions>,
+}
+
+impl From<PhysicalSortRequirement> for PhysicalSortExpr {
+    /// If options is `None`, the default sort options `ASC, NULLS LAST` is used.
+    ///
+    /// The default is picked to be consistent with
+    /// PostgreSQL: <https://www.postgresql.org/docs/current/queries-order.html>
+    fn from(value: PhysicalSortRequirement) -> Self {
+        let options = value.options.unwrap_or(SortOptions {
+            descending: false,
+            nulls_first: false,
+        });
+        PhysicalSortExpr {
+            expr: value.expr,
+            options,
+        }
+    }
+}
+
+impl From<PhysicalSortExpr> for PhysicalSortRequirement {
+    fn from(value: PhysicalSortExpr) -> Self {
+        PhysicalSortRequirement::new(value.expr, Some(value.options))
+    }
+}
+
+impl PartialEq for PhysicalSortRequirement {
+    fn eq(&self, other: &PhysicalSortRequirement) -> bool {
+        self.options == other.options && self.expr.eq(&other.expr)
+    }
+}
+
+impl std::fmt::Display for PhysicalSortRequirement {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let opts_string = self.options.as_ref().map_or("NA", to_str);
+        write!(f, "{} {}", self.expr, opts_string)
+    }
+}
+
+impl PhysicalSortRequirement {
+    /// Creates a new requirement.
+    ///
+    /// If `options` is `Some(..)`, creates an `exact` requirement,
+    /// which must match both `options` and `expr`.
+    ///
+    /// If `options` is `None`, creates a new `expr_only` requirement,
+    /// which must match only `expr`.
+    ///
+    /// See [`PhysicalSortRequirement`] for examples.
+    pub fn new(expr: Arc<dyn PhysicalExpr>, options: Option<SortOptions>) -> Self {
+        Self { expr, options }
+    }
+
+    /// Replace the required expression for this requirement with the new one
+    pub fn with_expr(mut self, expr: Arc<dyn PhysicalExpr>) -> Self {
+        self.expr = expr;
+        self
+    }
+
+    /// Returns whether this requirement is equal or more specific than `other`.
+    pub fn compatible(&self, other: &PhysicalSortRequirement) -> bool {
+        self.expr.eq(&other.expr)
+            && other.options.map_or(true, |other_opts| {
+                self.options.map_or(false, |opts| opts == other_opts)
+            })
+    }
+
+    /// Returns [`PhysicalSortRequirement`]s that require the exact
+    /// sort of the [`PhysicalSortExpr`]s in `ordering`
+    ///
+    /// This method takes `&'a PhysicalSortExpr` to make it easy to
+    /// use when implementing [`ExecutionPlan::required_input_ordering`].
+    ///
+    /// [`ExecutionPlan::required_input_ordering`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#method.required_input_ordering
+    pub fn from_sort_exprs<'a>(
+        ordering: impl IntoIterator<Item = &'a PhysicalSortExpr>,
+    ) -> Vec<PhysicalSortRequirement> {
+        ordering
+            .into_iter()
+            .cloned()
+            .map(PhysicalSortRequirement::from)
+            .collect()
+    }
+
+    /// Converts an iterator of [`PhysicalSortRequirement`] into a Vec
+    /// of [`PhysicalSortExpr`]s.
+    ///
+    /// This function converts `PhysicalSortRequirement` to `PhysicalSortExpr`
+    /// for each entry in the input. If the required ordering is `None` for an
+    /// entry, the default ordering `ASC, NULLS LAST` is used (see
+    /// [`PhysicalSortExpr::from`]).
+    pub fn to_sort_exprs(
+        requirements: impl IntoIterator<Item = PhysicalSortRequirement>,
+    ) -> Vec<PhysicalSortExpr> {
+        requirements
+            .into_iter()
+            .map(PhysicalSortExpr::from)
+            .collect()
+    }
+}
+
+/// Returns the SQL string representation of the given [SortOptions] object.
+#[inline]
+fn to_str(options: &SortOptions) -> &str {
+    match (options.descending, options.nulls_first) {
+        (true, true) => "DESC",
+        (true, false) => "DESC NULLS LAST",
+        (false, true) => "ASC",
+        (false, false) => "ASC NULLS LAST",
+    }
+}
+
+///`LexOrdering` is an alias for the type `Vec<PhysicalSortExpr>`, which represents
+/// a lexicographical ordering.
+pub type LexOrdering = Vec<PhysicalSortExpr>;
+
+///`LexOrderingRef` is an alias for the type &`[PhysicalSortExpr]`, which represents
+/// a reference to a lexicographical ordering.
+pub type LexOrderingRef<'a> = &'a [PhysicalSortExpr];
+
+///`LexRequirement` is an alias for the type `Vec<PhysicalSortRequirement>`, which
+/// represents a lexicographical ordering requirement.
+pub type LexRequirement = Vec<PhysicalSortRequirement>;
+
+///`LexRequirementRef` is an alias for the type &`[PhysicalSortRequirement]`, which
+/// represents a reference to a lexicographical ordering requirement.
+pub type LexRequirementRef<'a> = &'a [PhysicalSortRequirement]; diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index f6a9d3083ce3..7818f8ba7adf 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -28,7 +28,6 @@ mod partitioning; mod physical_expr; pub mod planner; mod scalar_function; -mod sort_expr; pub mod string_expressions; pub mod udf; pub mod utils; @@ -49,12 +48,16 @@ pub use physical_expr::{ physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal, PhysicalExprRef, }; -pub use planner::{create_physical_expr, create_physical_exprs}; -pub use scalar_function::ScalarFunctionExpr; -pub use sort_expr::{ + +pub use datafusion_physical_expr_core::physical_expr::PhysicalExpr; +pub use datafusion_physical_expr_core::sort_expr::{ LexOrdering, LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalSortExpr, PhysicalSortRequirement, }; + +pub use planner::{create_physical_expr, create_physical_exprs}; +pub use scalar_function::ScalarFunctionExpr; + pub use utils::{reverse_order_bys, split_conjunction}; pub use aggregate::first_last::create_first_value_accumulator; @@ -70,6 +73,3 @@ pub mod sort_properties { pub mod tree_node { pub use datafusion_physical_expr_core::tree_node::ExprContext; } - -// For backwards compatibility -pub use datafusion_physical_expr_core::physical_expr::PhysicalExpr; diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs deleted file mode 100644 index 914d76f9261a..000000000000 --- a/datafusion/physical-expr/src/sort_expr.rs +++ /dev/null @@ -1,269 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Sort expressions - -use std::fmt::Display; -use std::hash::{Hash, Hasher}; -use std::sync::Arc; - -use crate::PhysicalExpr; - -use arrow::compute::kernels::sort::{SortColumn, SortOptions}; -use arrow::record_batch::RecordBatch; -use arrow_schema::Schema; -use datafusion_common::Result; -use datafusion_expr::ColumnarValue; - -/// Represents Sort operation for a column in a RecordBatch -#[derive(Clone, Debug)] -pub struct PhysicalSortExpr { - /// Physical expression representing the column to sort - pub expr: Arc, - /// Option to specify how the given column should be sorted - pub options: SortOptions, -} - -impl PartialEq for PhysicalSortExpr { - fn eq(&self, other: &PhysicalSortExpr) -> bool { - self.options == other.options && self.expr.eq(&other.expr) - } -} - -impl Eq for PhysicalSortExpr {} - -impl Hash for PhysicalSortExpr { - fn hash(&self, state: &mut H) { - self.expr.hash(state); - self.options.hash(state); - } -} - -impl std::fmt::Display for PhysicalSortExpr { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{} {}", self.expr, to_str(&self.options)) - } -} - -impl PhysicalSortExpr { - /// evaluate the sort expression into SortColumn that can be passed into arrow sort kernel - pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result { - let value_to_sort = self.expr.evaluate(batch)?; - let array_to_sort = match value_to_sort { - ColumnarValue::Array(array) => array, - ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(batch.num_rows())?, - }; - Ok(SortColumn { - values: array_to_sort, - options: Some(self.options), - }) - } - - /// Checks whether this sort expression satisfies the given `requirement`. - /// If sort options are unspecified in `requirement`, only expressions are - /// compared for inequality. - pub fn satisfy( - &self, - requirement: &PhysicalSortRequirement, - schema: &Schema, - ) -> bool { - // If the column is not nullable, NULLS FIRST/LAST is not important. - let nullable = self.expr.nullable(schema).unwrap_or(true); - self.expr.eq(&requirement.expr) - && if nullable { - requirement - .options - .map_or(true, |opts| self.options == opts) - } else { - requirement - .options - .map_or(true, |opts| self.options.descending == opts.descending) - } - } - - /// Returns a [`Display`]able list of `PhysicalSortExpr`. - pub fn format_list(input: &[PhysicalSortExpr]) -> impl Display + '_ { - struct DisplayableList<'a>(&'a [PhysicalSortExpr]); - impl<'a> Display for DisplayableList<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let mut first = true; - for sort_expr in self.0 { - if first { - first = false; - } else { - write!(f, ",")?; - } - write!(f, "{}", sort_expr)?; - } - Ok(()) - } - } - DisplayableList(input) - } -} - -/// Represents sort requirement associated with a plan -/// -/// If the requirement includes [`SortOptions`] then both the -/// expression *and* the sort options must match. -/// -/// If the requirement does not include [`SortOptions`]) then only the -/// expressions must match. 
-/// -/// # Examples -/// -/// With sort options (`A`, `DESC NULLS FIRST`): -/// * `ORDER BY A DESC NULLS FIRST` matches -/// * `ORDER BY A ASC NULLS FIRST` does not match (`ASC` vs `DESC`) -/// * `ORDER BY B DESC NULLS FIRST` does not match (different expr) -/// -/// Without sort options (`A`, None): -/// * `ORDER BY A DESC NULLS FIRST` matches -/// * `ORDER BY A ASC NULLS FIRST` matches (`ASC` and `NULL` options ignored) -/// * `ORDER BY B DESC NULLS FIRST` does not match (different expr) -#[derive(Clone, Debug)] -pub struct PhysicalSortRequirement { - /// Physical expression representing the column to sort - pub expr: Arc, - /// Option to specify how the given column should be sorted. - /// If unspecified, there are no constraints on sort options. - pub options: Option, -} - -impl From for PhysicalSortExpr { - /// If options is `None`, the default sort options `ASC, NULLS LAST` is used. - /// - /// The default is picked to be consistent with - /// PostgreSQL: - fn from(value: PhysicalSortRequirement) -> Self { - let options = value.options.unwrap_or(SortOptions { - descending: false, - nulls_first: false, - }); - PhysicalSortExpr { - expr: value.expr, - options, - } - } -} - -impl From for PhysicalSortRequirement { - fn from(value: PhysicalSortExpr) -> Self { - PhysicalSortRequirement::new(value.expr, Some(value.options)) - } -} - -impl PartialEq for PhysicalSortRequirement { - fn eq(&self, other: &PhysicalSortRequirement) -> bool { - self.options == other.options && self.expr.eq(&other.expr) - } -} - -impl std::fmt::Display for PhysicalSortRequirement { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let opts_string = self.options.as_ref().map_or("NA", to_str); - write!(f, "{} {}", self.expr, opts_string) - } -} - -impl PhysicalSortRequirement { - /// Creates a new requirement. - /// - /// If `options` is `Some(..)`, creates an `exact` requirement, - /// which must match both `options` and `expr`. - /// - /// If `options` is `None`, Creates a new `expr_only` requirement, - /// which must match only `expr`. - /// - /// See [`PhysicalSortRequirement`] for examples. - pub fn new(expr: Arc, options: Option) -> Self { - Self { expr, options } - } - - /// Replace the required expression for this requirement with the new one - pub fn with_expr(mut self, expr: Arc) -> Self { - self.expr = expr; - self - } - - /// Returns whether this requirement is equal or more specific than `other`. - pub fn compatible(&self, other: &PhysicalSortRequirement) -> bool { - self.expr.eq(&other.expr) - && other.options.map_or(true, |other_opts| { - self.options.map_or(false, |opts| opts == other_opts) - }) - } - - /// Returns [`PhysicalSortRequirement`] that requires the exact - /// sort of the [`PhysicalSortExpr`]s in `ordering` - /// - /// This method takes `&'a PhysicalSortExpr` to make it easy to - /// use implementing [`ExecutionPlan::required_input_ordering`]. - /// - /// [`ExecutionPlan::required_input_ordering`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#method.required_input_ordering - pub fn from_sort_exprs<'a>( - ordering: impl IntoIterator, - ) -> Vec { - ordering - .into_iter() - .cloned() - .map(PhysicalSortRequirement::from) - .collect() - } - - /// Converts an iterator of [`PhysicalSortRequirement`] into a Vec - /// of [`PhysicalSortExpr`]s. - /// - /// This function converts `PhysicalSortRequirement` to `PhysicalSortExpr` - /// for each entry in the input. 
If required ordering is None for an entry - /// default ordering `ASC, NULLS LAST` if given (see the `PhysicalSortExpr::from`). - pub fn to_sort_exprs( - requirements: impl IntoIterator, - ) -> Vec { - requirements - .into_iter() - .map(PhysicalSortExpr::from) - .collect() - } -} - -/// Returns the SQL string representation of the given [SortOptions] object. -#[inline] -fn to_str(options: &SortOptions) -> &str { - match (options.descending, options.nulls_first) { - (true, true) => "DESC", - (true, false) => "DESC NULLS LAST", - (false, true) => "ASC", - (false, false) => "ASC NULLS LAST", - } -} - -///`LexOrdering` is an alias for the type `Vec`, which represents -/// a lexicographical ordering. -pub type LexOrdering = Vec; - -///`LexOrderingRef` is an alias for the type &`[PhysicalSortExpr]`, which represents -/// a reference to a lexicographical ordering. -pub type LexOrderingRef<'a> = &'a [PhysicalSortExpr]; - -///`LexRequirement` is an alias for the type `Vec`, which -/// represents a lexicographical ordering requirement. -pub type LexRequirement = Vec; - -///`LexRequirementRef` is an alias for the type &`[PhysicalSortRequirement]`, which -/// represents a reference to a lexicographical ordering requirement. -pub type LexRequirementRef<'a> = &'a [PhysicalSortRequirement]; From 835f1477919483da2c5b5f4cf5bc4d5367fca0e9 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 22:21:55 +0800 Subject: [PATCH 05/38] cleanup dependencies Signed-off-by: jayzhan211 --- datafusion/physical-expr-core/Cargo.toml | 32 +------------------ datafusion/physical-expr-core/src/lib.rs | 2 +- .../physical-expr-core/src/sort_expr.rs | 2 +- .../physical-expr-core/src/sort_properties.rs | 4 +-- datafusion/physical-expr-core/src/utils.rs | 5 ++- 5 files changed, 7 insertions(+), 38 deletions(-) diff --git a/datafusion/physical-expr-core/Cargo.toml b/datafusion/physical-expr-core/Cargo.toml index 5e6badc1c846..f4949a1b0d19 100644 --- a/datafusion/physical-expr-core/Cargo.toml +++ b/datafusion/physical-expr-core/Cargo.toml @@ -33,37 +33,7 @@ name = "datafusion_physical_expr_core" path = "src/lib.rs" [dependencies] -ahash = { version = "0.8", default-features = false, features = [ - "runtime-rng", -] } arrow = { workspace = true } -arrow-array = { workspace = true } -arrow-buffer = { workspace = true } -arrow-ord = { workspace = true } -arrow-schema = { workspace = true } -arrow-string = { workspace = true } -base64 = { version = "0.22", optional = true } -blake2 = { version = "^0.10.2", optional = true } -blake3 = { version = "1.0", optional = true } -chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } -datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -half = { workspace = true } -hashbrown = { version = "0.14", features = ["raw"] } -hex = { version = "0.4", optional = true } -indexmap = { workspace = true } -itertools = { workspace = true, features = ["use_std"] } -log = { workspace = true } -md-5 = { version = "^0.10.0", optional = true } -paste = "^1.0" -petgraph = "0.6.2" -rand = { workspace = true } -regex = { version = "1.8", optional = true } -sha2 = { version = "^0.10.1", optional = true } - -[dev-dependencies] -criterion = "0.5" -rand = { workspace = true } -rstest = { workspace = true } -tokio = { workspace = true, features = ["rt-multi-thread"] } +# itertools = { workspace = true, features = ["use_std"] } \ No newline at end of file diff --git a/datafusion/physical-expr-core/src/lib.rs 
b/datafusion/physical-expr-core/src/lib.rs index 9ccc35a351b8..275a386db81b 100644 --- a/datafusion/physical-expr-core/src/lib.rs +++ b/datafusion/physical-expr-core/src/lib.rs @@ -19,4 +19,4 @@ pub mod physical_expr; pub mod sort_expr; pub mod sort_properties; pub mod tree_node; -pub mod utils; \ No newline at end of file +pub mod utils; diff --git a/datafusion/physical-expr-core/src/sort_expr.rs b/datafusion/physical-expr-core/src/sort_expr.rs index 58ddb4303d2e..1e1187212d96 100644 --- a/datafusion/physical-expr-core/src/sort_expr.rs +++ b/datafusion/physical-expr-core/src/sort_expr.rs @@ -22,8 +22,8 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow::compute::kernels::sort::{SortColumn, SortOptions}; +use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; -use arrow_schema::Schema; use datafusion_common::Result; use datafusion_expr::ColumnarValue; diff --git a/datafusion/physical-expr-core/src/sort_properties.rs b/datafusion/physical-expr-core/src/sort_properties.rs index 4df29ced2f01..47a5d5ba5e3b 100644 --- a/datafusion/physical-expr-core/src/sort_properties.rs +++ b/datafusion/physical-expr-core/src/sort_properties.rs @@ -17,9 +17,9 @@ use std::ops::Neg; -use crate::tree_node::ExprContext; +use arrow::compute::SortOptions; -use arrow_schema::SortOptions; +use crate::tree_node::ExprContext; /// To propagate [`SortOptions`] across the `PhysicalExpr`, it is insufficient /// to simply use `Option`: There must be a differentiation between diff --git a/datafusion/physical-expr-core/src/utils.rs b/datafusion/physical-expr-core/src/utils.rs index 612c43f4a794..799588de2318 100644 --- a/datafusion/physical-expr-core/src/utils.rs +++ b/datafusion/physical-expr-core/src/utils.rs @@ -16,10 +16,9 @@ // under the License. use arrow::{ - array::MutableArrayData, + array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData}, compute::{and_kleene, is_not_null, SlicesIterator}, }; -use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; use datafusion_common::Result; /// Scatter `truthy` array by boolean mask. 
When the mask evaluates `true`, next values of `truthy`
@@ -69,7 +68,7 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
 mod tests {
     use std::sync::Arc;
 
-    use arrow_array::Int32Array;
+    use arrow::array::Int32Array;
     use datafusion_common::cast::{as_boolean_array, as_int32_array};
 
     use super::*;

From c5d80c8fa10e921c4e34a00138b23fd0f09118dc Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Wed, 3 Apr 2024 22:25:56 +0800
Subject: [PATCH 06/38] add readme

Signed-off-by: jayzhan211

---
 datafusion-cli/Cargo.lock                | 16 --------------
 datafusion/physical-expr-core/Cargo.toml |  1 -
 datafusion/physical-expr-core/README.md  | 27 ++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 17 deletions(-)
 create mode 100644 datafusion/physical-expr-core/README.md

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index b97981889e6e..feb720c41fbc 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1350,25 +1350,9 @@ dependencies = [
 name = "datafusion-physical-expr-core"
 version = "37.0.0"
 dependencies = [
- "ahash",
  "arrow",
- "arrow-array",
- "arrow-buffer",
- "arrow-ord",
- "arrow-schema",
- "arrow-string",
- "chrono",
  "datafusion-common",
- "datafusion-execution",
  "datafusion-expr",
- "half",
- "hashbrown 0.14.3",
- "indexmap 2.2.6",
- "itertools",
- "log",
- "paste",
- "petgraph",
- "rand",
 ]

 [[package]]
diff --git a/datafusion/physical-expr-core/Cargo.toml b/datafusion/physical-expr-core/Cargo.toml
index f4949a1b0d19..e5a09a787150 100644
--- a/datafusion/physical-expr-core/Cargo.toml
+++ b/datafusion/physical-expr-core/Cargo.toml
@@ -36,4 +36,3 @@ path = "src/lib.rs"
 arrow = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-expr = { workspace = true }
-# itertools = { workspace = true, features = ["use_std"] }
\ No newline at end of file
diff --git a/datafusion/physical-expr-core/README.md b/datafusion/physical-expr-core/README.md
new file mode 100644
index 000000000000..fbc671c4ac23
--- /dev/null
+++ b/datafusion/physical-expr-core/README.md
@@ -0,0 +1,27 @@
+
+
+# DataFusion Core Physical Expressions
+
+[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+
+This crate is a submodule of DataFusion that provides the core physical expression types,
+such as `PhysicalExpr` and `PhysicalSortExpr`.
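+
+A minimal usage sketch (illustrative only, not part of the crate's API surface; it
+assumes the `col` helper and the sort expression types that later commits in this
+series move into this crate):
+
+```rust
+use std::sync::Arc;
+
+use arrow::compute::SortOptions;
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_physical_expr_core::expressions::column::col;
+use datafusion_physical_expr_core::sort_expr::{PhysicalSortExpr, PhysicalSortRequirement};
+
+fn main() -> datafusion_common::Result<()> {
+    let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+
+    // `a ASC NULLS LAST`, expressed as a physical sort expression.
+    let sort_expr = PhysicalSortExpr {
+        expr: col("a", &schema)?,
+        options: SortOptions { descending: false, nulls_first: false },
+    };
+
+    // An "expr only" requirement (options = None) is satisfied by any
+    // ordering of the same expression.
+    let requirement = PhysicalSortRequirement::new(Arc::clone(&sort_expr.expr), None);
+    assert!(sort_expr.satisfy(&requirement, &schema));
+    Ok(())
+}
+```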
+ +[df]: https://crates.io/crates/datafusion From 7851de7576b32e429156e0f8e245bd9000b6a78f Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 3 Apr 2024 23:07:16 +0800 Subject: [PATCH 07/38] disable doc test Signed-off-by: jayzhan211 --- datafusion/physical-expr-core/src/physical_expr.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs index 67516034af44..f8fbe38600ce 100644 --- a/datafusion/physical-expr-core/src/physical_expr.rs +++ b/datafusion/physical-expr-core/src/physical_expr.rs @@ -47,8 +47,8 @@ use crate::utils::scatter; /// [`Expr`]: datafusion_expr::Expr /// /// # Example: Create `PhysicalExpr` from `Expr` -/// ``` -/// # use arrow_schema::{DataType, Field, Schema}; +/// ```no_run +/// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{Expr, col, lit}; /// # use datafusion_physical_expr::create_physical_expr; @@ -65,10 +65,10 @@ use crate::utils::scatter; /// ``` /// /// # Example: Executing a PhysicalExpr to obtain [`ColumnarValue`] -/// ``` +/// ```no_run /// # use std::sync::Arc; -/// # use arrow_array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; -/// # use arrow_schema::{DataType, Field, Schema}; +/// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; +/// # use arrow::datatypes::{DataType, Field, Schema}; /// # use datafusion_common::{assert_batches_eq, DFSchema}; /// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; /// # use datafusion_physical_expr::create_physical_expr; From f5aafb35b8c49b367fd44e7b48fab990c02dae47 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 4 Apr 2024 08:28:36 +0800 Subject: [PATCH 08/38] move column Signed-off-by: jayzhan211 --- .../src/expressions/column.rs | 137 ++++++++++++++++++ .../physical-expr-core/src/expressions/mod.rs | 1 + datafusion/physical-expr-core/src/lib.rs | 1 + .../physical-expr-core/src/physical_expr.rs | 18 ++- .../physical-expr/src/expressions/column.rs | 104 ------------- .../physical-expr/src/expressions/mod.rs | 3 +- datafusion/physical-expr/src/physical_expr.rs | 17 +-- 7 files changed, 159 insertions(+), 122 deletions(-) create mode 100644 datafusion/physical-expr-core/src/expressions/column.rs create mode 100644 datafusion/physical-expr-core/src/expressions/mod.rs diff --git a/datafusion/physical-expr-core/src/expressions/column.rs b/datafusion/physical-expr-core/src/expressions/column.rs new file mode 100644 index 000000000000..edffc9161d18 --- /dev/null +++ b/datafusion/physical-expr-core/src/expressions/column.rs @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Column expression
+
+use std::any::Any;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use arrow::{
+    datatypes::{DataType, Schema},
+    record_batch::RecordBatch,
+};
+use datafusion_common::{internal_err, Result};
+use datafusion_expr::ColumnarValue;
+
+use crate::physical_expr::{down_cast_any_ref, PhysicalExpr};
+
+/// Represents the column at a given index in a RecordBatch
+#[derive(Debug, Hash, PartialEq, Eq, Clone)]
+pub struct Column {
+    name: String,
+    index: usize,
+}
+
+impl Column {
+    /// Create a new column expression
+    pub fn new(name: &str, index: usize) -> Self {
+        Self {
+            name: name.to_owned(),
+            index,
+        }
+    }
+
+    /// Create a new column expression based on column name and schema
+    pub fn new_with_schema(name: &str, schema: &Schema) -> Result<Self> {
+        Ok(Column::new(name, schema.index_of(name)?))
+    }
+
+    /// Get the column name
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Get the column index
+    pub fn index(&self) -> usize {
+        self.index
+    }
+}
+
+impl std::fmt::Display for Column {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}@{}", self.name, self.index)
+    }
+}
+
+impl PhysicalExpr for Column {
+    /// Return a reference to Any that can be used for downcasting
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    /// Get the data type of this expression, given the schema of the input
+    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
+        self.bounds_check(input_schema)?;
+        Ok(input_schema.field(self.index).data_type().clone())
+    }
+
+    /// Decide whether this expression is nullable, given the schema of the input
+    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
+        self.bounds_check(input_schema)?;
+        Ok(input_schema.field(self.index).is_nullable())
+    }
+
+    /// Evaluate the expression
+    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
+        self.bounds_check(batch.schema().as_ref())?;
+        Ok(ColumnarValue::Array(batch.column(self.index).clone()))
+    }
+
+    fn children(&self) -> Vec<Arc<dyn PhysicalExpr>> {
+        vec![]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _children: Vec<Arc<dyn PhysicalExpr>>,
+    ) -> Result<Arc<dyn PhysicalExpr>> {
+        Ok(self)
+    }
+
+    fn dyn_hash(&self, state: &mut dyn Hasher) {
+        let mut s = state;
+        self.hash(&mut s);
+    }
+}
+
+impl PartialEq<dyn Any> for Column {
+    fn eq(&self, other: &dyn Any) -> bool {
+        down_cast_any_ref(other)
+            .downcast_ref::<Column>()
+            .map(|x| self == x)
+            .unwrap_or(false)
+    }
+}
+
+impl Column {
+    fn bounds_check(&self, input_schema: &Schema) -> Result<()> {
+        if self.index < input_schema.fields.len() {
+            Ok(())
+        } else {
+            internal_err!(
+                "PhysicalExpr Column references column '{}' at index {} (zero-based) but input schema only has {} columns: {:?}",
+                self.name,
+                self.index, input_schema.fields.len(), input_schema.fields().iter().map(|f| f.name().clone()).collect::<Vec<String>>())
+        }
+    }
+}
+
+/// Create a column expression
+pub fn col(name: &str, schema: &Schema) -> Result<Arc<dyn PhysicalExpr>> {
+    Ok(Arc::new(Column::new_with_schema(name, schema)?))
+}
\ No newline at end of file
diff --git a/datafusion/physical-expr-core/src/expressions/mod.rs b/datafusion/physical-expr-core/src/expressions/mod.rs
new file mode 100644
index 000000000000..b41b280eedf2
--- /dev/null
+++ b/datafusion/physical-expr-core/src/expressions/mod.rs
@@ -0,0 +1 @@
+pub mod column;
\ No newline at end of file
diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-core/src/lib.rs
index 275a386db81b..3fe11a246b32 100644
--- a/datafusion/physical-expr-core/src/lib.rs
+++ b/datafusion/physical-expr-core/src/lib.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and
limitations // under the License. +pub mod expressions; pub mod physical_expr; pub mod sort_expr; pub mod sort_properties; diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs index f8fbe38600ce..fa1b44d98177 100644 --- a/datafusion/physical-expr-core/src/physical_expr.rs +++ b/datafusion/physical-expr-core/src/physical_expr.rs @@ -38,12 +38,12 @@ use crate::utils::scatter; /// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical /// planning, and can be evaluated directly on a [`RecordBatch`]. They are /// normally created from `Expr` by a [`PhysicalPlanner`] and can be created -/// directly using [`create_physical_expr`]. +/// directly using `create_physical_expr`. /// /// A Physical expression knows its type, nullability and how to evaluate itself. /// /// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html -/// [`create_physical_expr`]: crate::create_physical_expr +/// `create_physical_expr`: datafusion_physical_expr::create_physical_expr /// [`Expr`]: datafusion_expr::Expr /// /// # Example: Create `PhysicalExpr` from `Expr` @@ -252,3 +252,17 @@ pub fn with_new_children_if_necessary( Ok(expr) } } + +pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { + if any.is::>() { + any.downcast_ref::>() + .unwrap() + .as_any() + } else if any.is::>() { + any.downcast_ref::>() + .unwrap() + .as_any() + } else { + any + } +} \ No newline at end of file diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs index a07f36e785e3..ac2cfebcc157 100644 --- a/datafusion/physical-expr/src/expressions/column.rs +++ b/datafusion/physical-expr/src/expressions/column.rs @@ -31,106 +31,6 @@ use arrow::{ use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; -/// Represents the column at a given index in a RecordBatch -#[derive(Debug, Hash, PartialEq, Eq, Clone)] -pub struct Column { - name: String, - index: usize, -} - -impl Column { - /// Create a new column expression - pub fn new(name: &str, index: usize) -> Self { - Self { - name: name.to_owned(), - index, - } - } - - /// Create a new column expression based on column name and schema - pub fn new_with_schema(name: &str, schema: &Schema) -> Result { - Ok(Column::new(name, schema.index_of(name)?)) - } - - /// Get the column name - pub fn name(&self) -> &str { - &self.name - } - - /// Get the column index - pub fn index(&self) -> usize { - self.index - } -} - -impl std::fmt::Display for Column { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}@{}", self.name, self.index) - } -} - -impl PhysicalExpr for Column { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn std::any::Any { - self - } - - /// Get the data type of this expression, given the schema of the input - fn data_type(&self, input_schema: &Schema) -> Result { - self.bounds_check(input_schema)?; - Ok(input_schema.field(self.index).data_type().clone()) - } - - /// Decide whehter this expression is nullable, given the schema of the input - fn nullable(&self, input_schema: &Schema) -> Result { - self.bounds_check(input_schema)?; - Ok(input_schema.field(self.index).is_nullable()) - } - - /// Evaluate the expression - fn evaluate(&self, batch: &RecordBatch) -> Result { - self.bounds_check(batch.schema().as_ref())?; - Ok(ColumnarValue::Array(batch.column(self.index).clone())) - } - - fn children(&self) -> 
Vec> { - vec![] - } - - fn with_new_children( - self: Arc, - _children: Vec>, - ) -> Result> { - Ok(self) - } - - fn dyn_hash(&self, state: &mut dyn Hasher) { - let mut s = state; - self.hash(&mut s); - } -} - -impl PartialEq for Column { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| self == x) - .unwrap_or(false) - } -} - -impl Column { - fn bounds_check(&self, input_schema: &Schema) -> Result<()> { - if self.index < input_schema.fields.len() { - Ok(()) - } else { - internal_err!( - "PhysicalExpr Column references column '{}' at index {} (zero-based) but input schema only has {} columns: {:?}", - self.name, - self.index, input_schema.fields.len(), input_schema.fields().iter().map(|f| f.name().clone()).collect::>()) - } - } -} #[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct UnKnownColumn { @@ -204,10 +104,6 @@ impl PartialEq for UnKnownColumn { } } -/// Create a column expression -pub fn col(name: &str, schema: &Schema) -> Result> { - Ok(Arc::new(Column::new_with_schema(name, schema)?)) -} #[cfg(test)] mod test { diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 7c4ea07dfbcb..a901f0257a45 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -80,7 +80,8 @@ pub use crate::PhysicalSortExpr; pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; -pub use column::{col, Column, UnKnownColumn}; +pub use datafusion_physical_expr_core::expressions::column::{col, Column}; +pub use column::UnKnownColumn; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index c9e67acaf8ff..6785e1fe60f6 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -15,29 +15,16 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; use std::sync::Arc; use datafusion_physical_expr_core::physical_expr::PhysicalExpr; use itertools::izip; +pub use datafusion_physical_expr_core::physical_expr::down_cast_any_ref; + /// Shared [`PhysicalExpr`]. pub type PhysicalExprRef = Arc; -pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { - if any.is::>() { - any.downcast_ref::>() - .unwrap() - .as_any() - } else if any.is::>() { - any.downcast_ref::>() - .unwrap() - .as_any() - } else { - any - } -} - /// This function is similar to the `contains` method of `Vec`. It finds /// whether `expr` is among `physical_exprs`. 
pub fn physical_exprs_contains( From 7bfc074d12de67628a3cf9a8540ef1a1d3fd11c3 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 4 Apr 2024 08:43:50 +0800 Subject: [PATCH 09/38] fmt Signed-off-by: jayzhan211 --- datafusion/physical-expr-core/src/expressions/column.rs | 2 +- datafusion/physical-expr-core/src/expressions/mod.rs | 2 +- datafusion/physical-expr-core/src/physical_expr.rs | 8 ++++---- datafusion/physical-expr/src/expressions/column.rs | 2 -- datafusion/physical-expr/src/expressions/mod.rs | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-expr-core/src/expressions/column.rs b/datafusion/physical-expr-core/src/expressions/column.rs index edffc9161d18..2cd52d6332fb 100644 --- a/datafusion/physical-expr-core/src/expressions/column.rs +++ b/datafusion/physical-expr-core/src/expressions/column.rs @@ -134,4 +134,4 @@ impl Column { /// Create a column expression pub fn col(name: &str, schema: &Schema) -> Result> { Ok(Arc::new(Column::new_with_schema(name, schema)?)) -} \ No newline at end of file +} diff --git a/datafusion/physical-expr-core/src/expressions/mod.rs b/datafusion/physical-expr-core/src/expressions/mod.rs index b41b280eedf2..6b55f6606152 100644 --- a/datafusion/physical-expr-core/src/expressions/mod.rs +++ b/datafusion/physical-expr-core/src/expressions/mod.rs @@ -1 +1 @@ -pub mod column; \ No newline at end of file +pub mod column; diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs index fa1b44d98177..f1a6ab2e4486 100644 --- a/datafusion/physical-expr-core/src/physical_expr.rs +++ b/datafusion/physical-expr-core/src/physical_expr.rs @@ -38,13 +38,13 @@ use crate::utils::scatter; /// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical /// planning, and can be evaluated directly on a [`RecordBatch`]. They are /// normally created from `Expr` by a [`PhysicalPlanner`] and can be created -/// directly using `create_physical_expr`. +/// directly using [`create_physical_expr`]. /// /// A Physical expression knows its type, nullability and how to evaluate itself. 
/// -/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html -/// `create_physical_expr`: datafusion_physical_expr::create_physical_expr /// [`Expr`]: datafusion_expr::Expr +/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html +/// [`create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html /// /// # Example: Create `PhysicalExpr` from `Expr` /// ```no_run @@ -265,4 +265,4 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { } else { any } -} \ No newline at end of file +} diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs index ac2cfebcc157..634a56d1d683 100644 --- a/datafusion/physical-expr/src/expressions/column.rs +++ b/datafusion/physical-expr/src/expressions/column.rs @@ -31,7 +31,6 @@ use arrow::{ use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; - #[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct UnKnownColumn { name: String, @@ -104,7 +103,6 @@ impl PartialEq for UnKnownColumn { } } - #[cfg(test)] mod test { use crate::expressions::Column; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index a901f0257a45..13932cde6d1a 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -80,8 +80,8 @@ pub use crate::PhysicalSortExpr; pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; -pub use datafusion_physical_expr_core::expressions::column::{col, Column}; pub use column::UnKnownColumn; +pub use datafusion_physical_expr_core::expressions::column::{col, Column}; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; From 675d2fe66a66501c24ca059111417868a90d86da Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Thu, 4 Apr 2024 09:00:17 +0800 Subject: [PATCH 10/38] move aggregatexp Signed-off-by: jayzhan211 --- .../physical-expr-core/src/aggregate/mod.rs | 102 ++++++++++++++++++ .../physical-expr-core/src/aggregate/utils.rs | 37 +++++++ datafusion/physical-expr-core/src/lib.rs | 1 + datafusion/physical-expr/src/aggregate/mod.rs | 80 +------------- .../physical-expr/src/aggregate/utils.rs | 22 +--- datafusion/physical-expr/src/lib.rs | 2 +- 6 files changed, 145 insertions(+), 99 deletions(-) create mode 100644 datafusion/physical-expr-core/src/aggregate/mod.rs create mode 100644 datafusion/physical-expr-core/src/aggregate/utils.rs diff --git a/datafusion/physical-expr-core/src/aggregate/mod.rs b/datafusion/physical-expr-core/src/aggregate/mod.rs new file mode 100644 index 000000000000..579f51815d84 --- /dev/null +++ b/datafusion/physical-expr-core/src/aggregate/mod.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod utils;
+
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+use crate::physical_expr::PhysicalExpr;
+use crate::sort_expr::PhysicalSortExpr;
+
+use arrow::datatypes::Field;
+use datafusion_common::{not_impl_err, Result};
+use datafusion_expr::{Accumulator, GroupsAccumulator};
+
+/// An aggregate expression that:
+/// * knows its resulting field
+/// * knows how to create its accumulator
+/// * knows its accumulator's state's field
+/// * knows the expressions from which its accumulator will receive values
+///
+/// Any implementation of this trait also needs to implement the
+/// `PartialEq<dyn Any>` to allow comparing equality between the
+/// trait objects.
+pub trait AggregateExpr: Send + Sync + Debug + PartialEq<dyn Any> {
+    /// Returns the aggregate expression as [`Any`] so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// The field of the final result of this aggregation.
+    fn field(&self) -> Result<Field>;
+
+    /// The accumulator used to accumulate values from the expressions.
+    /// The accumulator expects the same number of arguments as `expressions` and must
+    /// return states with the same description as `state_fields`
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>>;
+
+    /// The fields that encapsulate the Accumulator's state;
+    /// the number of fields here equals the number of states that the accumulator contains
+    fn state_fields(&self) -> Result<Vec<Field>>;
+
+    /// Expressions that are passed to the Accumulator.
+    /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many.
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>>;
+
+    /// Order by requirements for the aggregate function.
+    /// By default it is `None` (there is no requirement).
+    /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)`, should implement this.
+    fn order_bys(&self) -> Option<&[PhysicalSortExpr]> {
+        None
+    }
+
+    /// Human readable name such as `"MIN(c2)"`. The default
+    /// implementation returns placeholder text.
+    fn name(&self) -> &str {
+        "AggregateExpr: default name"
+    }
+
+    /// If the aggregate expression has a specialized
+    /// [`GroupsAccumulator`] implementation. If this returns true,
+    /// [`Self::create_groups_accumulator`] will be called.
+    fn groups_accumulator_supported(&self) -> bool {
+        false
+    }
+
+    /// Return a specialized [`GroupsAccumulator`] that manages state
+    /// for all groups.
+    ///
+    /// For maximum performance, a [`GroupsAccumulator`] should be
+    /// implemented in addition to [`Accumulator`].
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet")
+    }
+
+    /// Construct an expression that calculates the aggregate in reverse.
+    /// Typically the "reverse" expression is itself (e.g. SUM, COUNT).
+    /// For aggregates that do not support calculation in reverse,
+    /// returns None (which is the default value).
+    fn reverse_expr(&self) -> Option<Arc<dyn AggregateExpr>> {
+        None
+    }
+
+    /// Creates accumulator implementation that supports retract
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet")
+    }
+}
diff --git a/datafusion/physical-expr-core/src/aggregate/utils.rs b/datafusion/physical-expr-core/src/aggregate/utils.rs
new file mode 100644
index 000000000000..12aa7d298bbb
--- /dev/null
+++ b/datafusion/physical-expr-core/src/aggregate/utils.rs
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{any::Any, sync::Arc};
+
+use super::AggregateExpr;
+
+/// Downcast a `Box<dyn AggregateExpr>` or `Arc<dyn AggregateExpr>`
+/// and return the inner trait object as [`Any`] so
+/// that it can be downcast to a specific implementation.
+///
+/// This method is used when implementing the `PartialEq<dyn Any>`
+/// for [`AggregateExpr`] aggregation expressions and allows comparing the equality
+/// between the trait objects.
+pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
+    if let Some(obj) = any.downcast_ref::<Arc<dyn AggregateExpr>>() {
+        obj.as_any()
+    } else if let Some(obj) = any.downcast_ref::<Box<dyn AggregateExpr>>() {
+        obj.as_any()
+    } else {
+        any
+    }
+}
diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-core/src/lib.rs
index 3fe11a246b32..53e3134a1b05 100644
--- a/datafusion/physical-expr-core/src/lib.rs
+++ b/datafusion/physical-expr-core/src/lib.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.

+pub mod aggregate;
 pub mod expressions;
 pub mod physical_expr;
 pub mod sort_expr;
diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs
index 893178f29d08..8ad9b8b98577 100644
--- a/datafusion/physical-expr/src/aggregate/mod.rs
+++ b/datafusion/physical-expr/src/aggregate/mod.rs
@@ -15,16 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.

-use std::any::Any;
-use std::fmt::Debug;
 use std::sync::Arc;

 use crate::expressions::{NthValueAgg, OrderSensitiveArrayAgg};
-use crate::{PhysicalExpr, PhysicalSortExpr};

-use arrow::datatypes::Field;
-use datafusion_common::{not_impl_err, Result};
-use datafusion_expr::{Accumulator, GroupsAccumulator};
+pub use datafusion_physical_expr_core::aggregate::AggregateExpr;

 mod hyperloglog;
 mod tdigest;
@@ -62,79 +57,6 @@ pub mod build_in;
 pub mod moving_min_max;
 pub mod utils;

-/// An aggregate expression that:
-/// * knows its resulting field
-/// * knows how to create its accumulator
-/// * knows its accumulator's state's field
-/// * knows the expressions from whose its accumulator will receive values
-///
-/// Any implementation of this trait also needs to implement the
-/// `PartialEq` to allows comparing equality between the
-/// trait objects.
-pub trait AggregateExpr: Send + Sync + Debug + PartialEq { - /// Returns the aggregate expression as [`Any`] so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - - /// the field of the final result of this aggregation. - fn field(&self) -> Result; - - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; - - /// the fields that encapsulate the Accumulator's state - /// the number of fields here equals the number of states that the accumulator contains - fn state_fields(&self) -> Result>; - - /// expressions that are passed to the Accumulator. - /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. - fn expressions(&self) -> Vec>; - - /// Order by requirements for the aggregate function - /// By default it is `None` (there is no requirement) - /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)` should implement this - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - /// Human readable name such as `"MIN(c2)"`. The default - /// implementation returns placeholder text. - fn name(&self) -> &str { - "AggregateExpr: default name" - } - - /// If the aggregate expression has a specialized - /// [`GroupsAccumulator`] implementation. If this returns true, - /// `[Self::create_groups_accumulator`] will be called. - fn groups_accumulator_supported(&self) -> bool { - false - } - - /// Return a specialized [`GroupsAccumulator`] that manages state - /// for all groups. - /// - /// For maximum performance, a [`GroupsAccumulator`] should be - /// implemented in addition to [`Accumulator`]. - fn create_groups_accumulator(&self) -> Result> { - not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet") - } - - /// Construct an expression that calculates the aggregate in reverse. - /// Typically the "reverse" expression is itself (e.g. SUM, COUNT). - /// For aggregates that do not support calculation in reverse, - /// returns None (which is the default value). - fn reverse_expr(&self) -> Option> { - None - } - - /// Creates accumulator implementation that supports retract - fn create_sliding_accumulator(&self) -> Result> { - not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet") - } -} - /// Checks whether the given aggregate expression is order-sensitive. /// For instance, a `SUM` aggregation doesn't depend on the order of its inputs. /// However, an `ARRAY_AGG` with `ORDER BY` depends on the input ordering. diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs index 613f6118e907..ed473fc49397 100644 --- a/datafusion/physical-expr/src/aggregate/utils.rs +++ b/datafusion/physical-expr/src/aggregate/utils.rs @@ -17,10 +17,11 @@ //! Utilities used in aggregates -use std::any::Any; use std::sync::Arc; -use crate::{AggregateExpr, PhysicalSortExpr}; +pub use datafusion_physical_expr_core::aggregate::utils::down_cast_any_ref; + +use crate::PhysicalSortExpr; use arrow::array::{ArrayRef, ArrowNativeTypeOp}; use arrow_array::cast::AsArray; @@ -170,23 +171,6 @@ pub fn adjust_output_array( Ok(array) } -/// Downcast a `Box` or `Arc` -/// and return the inner trait object as [`Any`] so -/// that it can be downcast to a specific implementation. 
-///
-/// This method is used when implementing `PartialEq<dyn Any>`
-/// for [`AggregateExpr`] aggregation expressions and allows comparing the equality
-/// between the trait objects.
-pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
-    if let Some(obj) = any.downcast_ref::<Box<dyn AggregateExpr>>() {
-        obj.as_any()
-    } else if let Some(obj) = any.downcast_ref::<Arc<dyn AggregateExpr>>() {
-        obj.as_any()
-    } else {
-        any
-    }
-}
-
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
index 7818f8ba7adf..0935f9d6d9e8 100644
--- a/datafusion/physical-expr/src/lib.rs
+++ b/datafusion/physical-expr/src/lib.rs
@@ -40,8 +40,8 @@ pub mod execution_props {
 }
 
 pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState};
-pub use aggregate::AggregateExpr;
 pub use analysis::{analyze, AnalysisContext, ExprBoundaries};
+pub use datafusion_physical_expr_core::aggregate::AggregateExpr;
 pub use equivalence::EquivalenceProperties;
 pub use partitioning::{Distribution, Partitioning};
 pub use physical_expr::{

From 5220087d5e7c70176cdc496b73ea5dd678a898ea Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Thu, 4 Apr 2024 09:03:52 +0800
Subject: [PATCH 11/38] move other two utils

Signed-off-by: jayzhan211
---
 .../physical-expr-core/src/aggregate/utils.rs | 32 +++++++++++++++++++
 .../physical-expr/src/aggregate/utils.rs      | 32 +++----------------
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/datafusion/physical-expr-core/src/aggregate/utils.rs b/datafusion/physical-expr-core/src/aggregate/utils.rs
index 12aa7d298bbb..9821ba626b18 100644
--- a/datafusion/physical-expr-core/src/aggregate/utils.rs
+++ b/datafusion/physical-expr-core/src/aggregate/utils.rs
@@ -17,6 +17,13 @@
 
 use std::{any::Any, sync::Arc};
 
+use arrow::{
+    compute::SortOptions,
+    datatypes::{DataType, Field},
+};
+
+use crate::sort_expr::PhysicalSortExpr;
+
 use super::AggregateExpr;
 
 /// Downcast a `Box<dyn AggregateExpr>` or `Arc<dyn AggregateExpr>`
@@ -35,3 +42,28 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
         any
     }
 }
+
+/// Construct corresponding fields for lexicographical ordering requirement expression
+pub fn ordering_fields(
+    ordering_req: &[PhysicalSortExpr],
+    // Data type of each expression in the ordering requirement
+    data_types: &[DataType],
+) -> Vec<Field> {
+    ordering_req
+        .iter()
+        .zip(data_types.iter())
+        .map(|(sort_expr, dtype)| {
+            Field::new(
+                sort_expr.expr.to_string().as_str(),
+                dtype.clone(),
+                // Multi partitions may be empty hence field should be nullable.
+                true,
+            )
+        })
+        .collect()
+}
+
+/// Selects the sort option attribute from all the given `PhysicalSortExpr`s.
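+///
+/// Illustrative sketch (fenced as `ignore` since constructing the input
+/// requires a column expression and a schema; `ordering_req` is assumed
+/// to be in scope):
+///
+/// ```ignore
+/// // For a requirement like [a ASC NULLS LAST], extract just the options:
+/// let options = get_sort_options(&ordering_req);
+/// assert_eq!(options.len(), ordering_req.len());
+/// assert!(!options[0].descending);
+/// ```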
+pub fn get_sort_options(ordering_req: &[PhysicalSortExpr]) -> Vec<SortOptions> {
+    ordering_req.iter().map(|item| item.options).collect()
+}
diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs
index ed473fc49397..a0db8e5158c4 100644
--- a/datafusion/physical-expr/src/aggregate/utils.rs
+++ b/datafusion/physical-expr/src/aggregate/utils.rs
@@ -19,9 +19,10 @@
 
 use std::sync::Arc;
 
+// For backwards compatibility
 pub use datafusion_physical_expr_core::aggregate::utils::down_cast_any_ref;
-
-use crate::PhysicalSortExpr;
+pub use datafusion_physical_expr_core::aggregate::utils::get_sort_options;
+pub use datafusion_physical_expr_core::aggregate::utils::ordering_fields;
 
 use arrow::array::{ArrayRef, ArrowNativeTypeOp};
 use arrow_array::cast::AsArray;
@@ -30,7 +31,7 @@ use arrow_array::types::{
     TimestampNanosecondType, TimestampSecondType,
 };
 use arrow_buffer::{ArrowNativeType, ToByteSlice};
-use arrow_schema::{DataType, Field, SortOptions};
+use arrow_schema::DataType;
 use datafusion_common::{exec_err, DataFusionError, Result};
 use datafusion_expr::Accumulator;
 
@@ -171,23 +172,6 @@ pub fn adjust_output_array(
     Ok(array)
 }
 
-/// Construct corresponding fields for lexicographical ordering requirement expression
-pub fn ordering_fields(
-    ordering_req: &[PhysicalSortExpr],
-    // Data type of each expression in the ordering requirement
-    data_types: &[DataType],
-) -> Vec<Field> {
-    ordering_req
-        .iter()
-        .zip(data_types.iter())
-        .map(|(sort_expr, dtype)| {
-            Field::new(
-                sort_expr.expr.to_string().as_str(),
-                dtype.clone(),
-                // Multi partitions may be empty hence field should be nullable.
-                true,
-            )
-        })
-        .collect()
-}
-
-/// Selects the sort option attribute from all the given `PhysicalSortExpr`s.
-pub fn get_sort_options(ordering_req: &[PhysicalSortExpr]) -> Vec<SortOptions> {
-    ordering_req.iter().map(|item| item.options).collect()
-}
-
 /// A wrapper around a type to provide hash for floats
 #[derive(Copy, Clone, Debug)]
 pub(crate) struct Hashable<T>(pub T);

From 113a000ef7127f568208c2d2e0f8392a1a170ba1 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Thu, 4 Apr 2024 10:17:59 +0800
Subject: [PATCH 12/38] license

Signed-off-by: jayzhan211
---
 .../physical-expr-core/src/expressions/mod.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/datafusion/physical-expr-core/src/expressions/mod.rs b/datafusion/physical-expr-core/src/expressions/mod.rs
index 6b55f6606152..d102422081dc 100644
--- a/datafusion/physical-expr-core/src/expressions/mod.rs
+++ b/datafusion/physical-expr-core/src/expressions/mod.rs
@@ -1 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
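+
+//! Shared physical expression building blocks; currently just the
+//! [`column`] module.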
+
 pub mod column;

From fea87e3633ab4049cc306160d73fc58f4664dad9 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Thu, 4 Apr 2024 10:45:05 +0800
Subject: [PATCH 13/38] switch to ignore

Signed-off-by: jayzhan211
---
 datafusion/physical-expr-core/src/physical_expr.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-core/src/physical_expr.rs
index f1a6ab2e4486..b5ee582ab741 100644
--- a/datafusion/physical-expr-core/src/physical_expr.rs
+++ b/datafusion/physical-expr-core/src/physical_expr.rs
@@ -47,7 +47,7 @@ use crate::utils::scatter;
 /// [`create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html
 ///
 /// # Example: Create `PhysicalExpr` from `Expr`
-/// ```no_run
+/// ```ignore
 /// # use arrow::datatypes::{DataType, Field, Schema};
 /// # use datafusion_common::DFSchema;
 /// # use datafusion_expr::{Expr, col, lit};
@@ -65,7 +65,7 @@ use crate::utils::scatter;
 /// ```
 ///
 /// # Example: Executing a PhysicalExpr to obtain [`ColumnarValue`]
-/// ```no_run
+/// ```ignore
 /// # use std::sync::Arc;
 /// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch};
 /// # use arrow::datatypes::{DataType, Field, Schema};

From 06d87bc3ab5a8bf1904f509e1726991f573da047 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Thu, 4 Apr 2024 10:46:55 +0800
Subject: [PATCH 14/38] move reverse order

Signed-off-by: jayzhan211
---
 datafusion/physical-expr-core/src/utils.rs | 15 +++++++++++++++
 datafusion/physical-expr/src/lib.rs        |  3 ++-
 datafusion/physical-expr/src/utils/mod.rs  | 13 -------------
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/datafusion/physical-expr-core/src/utils.rs b/datafusion/physical-expr-core/src/utils.rs
index 799588de2318..459b5a4849cb 100644
--- a/datafusion/physical-expr-core/src/utils.rs
+++ b/datafusion/physical-expr-core/src/utils.rs
@@ -21,6 +21,8 @@ use arrow::{
 };
 use datafusion_common::Result;
 
+use crate::sort_expr::PhysicalSortExpr;
+
 /// Scatter `truthy` array by boolean mask. When the mask evaluates `true`, next values of `truthy`
 /// are taken, when the mask evaluates `false` values null values are filled.
 ///
@@ -64,6 +66,19 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
     Ok(make_array(data))
 }
 
+/// Reverses the ORDER BY expression, which is useful during equivalent window
+/// expression construction. For instance, 'ORDER BY a ASC, NULLS LAST' turns into
+/// 'ORDER BY a DESC, NULLS FIRST'.
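+///
+/// Illustrative sketch (fenced as `ignore` since building a
+/// `PhysicalSortExpr` needs a column expression and a schema;
+/// `ordering_req` is assumed to be in scope):
+///
+/// ```ignore
+/// let reversed = reverse_order_bys(&ordering_req);
+/// // Every sort direction and null ordering is flipped:
+/// assert_eq!(reversed[0].options, !ordering_req[0].options);
+/// ```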
+pub fn reverse_order_bys(order_bys: &[PhysicalSortExpr]) -> Vec<PhysicalSortExpr> {
+    order_bys
+        .iter()
+        .map(|e| PhysicalSortExpr {
+            expr: e.expr.clone(),
+            options: !e.options,
+        })
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
index 0935f9d6d9e8..2da2a76ec0ca 100644
--- a/datafusion/physical-expr/src/lib.rs
+++ b/datafusion/physical-expr/src/lib.rs
@@ -58,7 +58,8 @@ pub use datafusion_physical_expr_core::sort_expr::{
 
 pub use planner::{create_physical_expr, create_physical_exprs};
 pub use scalar_function::ScalarFunctionExpr;
-pub use utils::{reverse_order_bys, split_conjunction};
+pub use datafusion_physical_expr_core::utils::reverse_order_bys;
+pub use utils::split_conjunction;
 
 pub use aggregate::first_last::create_first_value_accumulator;
 
diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs
index 0aea05556697..e55bc3d15665 100644
--- a/datafusion/physical-expr/src/utils/mod.rs
+++ b/datafusion/physical-expr/src/utils/mod.rs
@@ -243,19 +243,6 @@ pub fn reassign_predicate_columns(
         .data()
 }
 
-/// Reverses the ORDER BY expression, which is useful during equivalent window
-/// expression construction. For instance, 'ORDER BY a ASC, NULLS LAST' turns into
-/// 'ORDER BY a DESC, NULLS FIRST'.
-pub fn reverse_order_bys(order_bys: &[PhysicalSortExpr]) -> Vec<PhysicalSortExpr> {
-    order_bys
-        .iter()
-        .map(|e| PhysicalSortExpr {
-            expr: e.expr.clone(),
-            options: !e.options,
-        })
-        .collect()
-}
-
 /// Merge left and right sort expressions, checking for duplicates.
 pub fn merge_vectors(
     left: &[PhysicalSortExpr],

From 26e57821fa06a7beb1769f23a7514a6ab758ce34 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Fri, 5 Apr 2024 07:49:05 +0800
Subject: [PATCH 15/38] rename to common

Signed-off-by: jayzhan211
---
 Cargo.toml                                      |  4 ++--
 .../Cargo.toml                                  |  4 ++--
 .../README.md                                   |  4 ++--
 .../src/aggregate/mod.rs                        |  0
 .../src/aggregate/utils.rs                      |  0
 .../src/expressions/column.rs                   |  0
 .../src/expressions/mod.rs                      |  0
 .../src/lib.rs                                  |  0
 .../src/physical_expr.rs                        |  0
 .../src/sort_expr.rs                            |  0
 .../src/sort_properties.rs                      |  0
 .../src/tree_node.rs                            |  0
 .../src/utils.rs                                |  0
 datafusion/physical-expr/Cargo.toml             |  2 +-
 datafusion/physical-expr/src/aggregate/mod.rs   |  2 +-
 datafusion/physical-expr/src/aggregate/utils.rs |  6 +++---
 datafusion/physical-expr/src/lib.rs             | 12 ++++++------
 datafusion/physical-expr/src/physical_expr.rs   |  4 ++--
 18 files changed, 19 insertions(+), 19 deletions(-)
 rename datafusion/{physical-expr-core => physical-expr-common}/Cargo.toml (94%)
 rename datafusion/{physical-expr-core => physical-expr-common}/README.md (85%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/aggregate/mod.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/aggregate/utils.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/expressions/column.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/expressions/mod.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/lib.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/physical_expr.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/sort_expr.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/sort_properties.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/tree_node.rs (100%)
 rename datafusion/{physical-expr-core => physical-expr-common}/src/utils.rs
(100%) diff --git a/Cargo.toml b/Cargo.toml index 64c228c89870..ca34ea9c2a24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ members = [ "datafusion/functions", "datafusion/functions-array", "datafusion/optimizer", - "datafusion/physical-expr-core", + "datafusion/physical-expr-common", "datafusion/physical-expr", "datafusion/physical-plan", "datafusion/proto", @@ -81,7 +81,7 @@ datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false } -datafusion-physical-expr-core = { path = "datafusion/physical-expr-core", version = "37.0.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "37.0.0", default-features = false } datafusion-physical-plan = { path = "datafusion/physical-plan", version = "37.0.0" } datafusion-proto = { path = "datafusion/proto", version = "37.0.0" } datafusion-sql = { path = "datafusion/sql", version = "37.0.0" } diff --git a/datafusion/physical-expr-core/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml similarity index 94% rename from datafusion/physical-expr-core/Cargo.toml rename to datafusion/physical-expr-common/Cargo.toml index e5a09a787150..1a347643e6eb 100644 --- a/datafusion/physical-expr-core/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -16,7 +16,7 @@ # under the License. [package] -name = "datafusion-physical-expr-core" +name = "datafusion-physical-expr-common" description = "Core physical expression implementation for DataFusion query engine" keywords = ["arrow", "query", "sql"] readme = "README.md" @@ -29,7 +29,7 @@ authors = { workspace = true } rust-version = { workspace = true } [lib] -name = "datafusion_physical_expr_core" +name = "datafusion_physical_expr_common" path = "src/lib.rs" [dependencies] diff --git a/datafusion/physical-expr-core/README.md b/datafusion/physical-expr-common/README.md similarity index 85% rename from datafusion/physical-expr-core/README.md rename to datafusion/physical-expr-common/README.md index fbc671c4ac23..7a1eff77d3b4 100644 --- a/datafusion/physical-expr-core/README.md +++ b/datafusion/physical-expr-common/README.md @@ -21,7 +21,7 @@ [DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. -This crate is a submodule of DataFusion that provides the core functionality of physical expressions. -Like `PhysicalExpr` or `PhysicalSortExpr` and related things. +This crate is a submodule of DataFusion that provides shared APIs for implementing +physical expressions such as `PhysicalExpr` and `PhysicalSortExpr`. 
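+
+As a rough sketch of the kind of API that lives here (hypothetical usage;
+exact public paths may differ), a `PhysicalSortExpr` pairs a `PhysicalExpr`
+with `arrow`'s `SortOptions`:
+
+```rust
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
+
+/// Render a sort expression as e.g. "a DESC" (`PhysicalExpr` implements `Display`).
+fn describe(sort_expr: &PhysicalSortExpr) -> String {
+    let direction = if sort_expr.options.descending { "DESC" } else { "ASC" };
+    format!("{} {}", sort_expr.expr, direction)
+}
+```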
[df]: https://crates.io/crates/datafusion diff --git a/datafusion/physical-expr-core/src/aggregate/mod.rs b/datafusion/physical-expr-common/src/aggregate/mod.rs similarity index 100% rename from datafusion/physical-expr-core/src/aggregate/mod.rs rename to datafusion/physical-expr-common/src/aggregate/mod.rs diff --git a/datafusion/physical-expr-core/src/aggregate/utils.rs b/datafusion/physical-expr-common/src/aggregate/utils.rs similarity index 100% rename from datafusion/physical-expr-core/src/aggregate/utils.rs rename to datafusion/physical-expr-common/src/aggregate/utils.rs diff --git a/datafusion/physical-expr-core/src/expressions/column.rs b/datafusion/physical-expr-common/src/expressions/column.rs similarity index 100% rename from datafusion/physical-expr-core/src/expressions/column.rs rename to datafusion/physical-expr-common/src/expressions/column.rs diff --git a/datafusion/physical-expr-core/src/expressions/mod.rs b/datafusion/physical-expr-common/src/expressions/mod.rs similarity index 100% rename from datafusion/physical-expr-core/src/expressions/mod.rs rename to datafusion/physical-expr-common/src/expressions/mod.rs diff --git a/datafusion/physical-expr-core/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs similarity index 100% rename from datafusion/physical-expr-core/src/lib.rs rename to datafusion/physical-expr-common/src/lib.rs diff --git a/datafusion/physical-expr-core/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs similarity index 100% rename from datafusion/physical-expr-core/src/physical_expr.rs rename to datafusion/physical-expr-common/src/physical_expr.rs diff --git a/datafusion/physical-expr-core/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs similarity index 100% rename from datafusion/physical-expr-core/src/sort_expr.rs rename to datafusion/physical-expr-common/src/sort_expr.rs diff --git a/datafusion/physical-expr-core/src/sort_properties.rs b/datafusion/physical-expr-common/src/sort_properties.rs similarity index 100% rename from datafusion/physical-expr-core/src/sort_properties.rs rename to datafusion/physical-expr-common/src/sort_properties.rs diff --git a/datafusion/physical-expr-core/src/tree_node.rs b/datafusion/physical-expr-common/src/tree_node.rs similarity index 100% rename from datafusion/physical-expr-core/src/tree_node.rs rename to datafusion/physical-expr-common/src/tree_node.rs diff --git a/datafusion/physical-expr-core/src/utils.rs b/datafusion/physical-expr-common/src/utils.rs similarity index 100% rename from datafusion/physical-expr-core/src/utils.rs rename to datafusion/physical-expr-common/src/utils.rs diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index ec802ae10bc2..a345b9b59727 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -59,7 +59,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-physical-expr-core = { workspace = true } +datafusion-physical-expr-common = { workspace = true } half = { workspace = true } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", optional = true } diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 8ad9b8b98577..e176084ae6ec 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -19,7 
+19,7 @@ use std::sync::Arc; use crate::expressions::{NthValueAgg, OrderSensitiveArrayAgg}; -pub use datafusion_physical_expr_core::aggregate::AggregateExpr; +pub use datafusion_physical_expr_common::aggregate::AggregateExpr; mod hyperloglog; mod tdigest; diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs index a0db8e5158c4..d14a52f5752d 100644 --- a/datafusion/physical-expr/src/aggregate/utils.rs +++ b/datafusion/physical-expr/src/aggregate/utils.rs @@ -20,9 +20,9 @@ use std::sync::Arc; // For backwards compatibility -pub use datafusion_physical_expr_core::aggregate::utils::down_cast_any_ref; -pub use datafusion_physical_expr_core::aggregate::utils::get_sort_options; -pub use datafusion_physical_expr_core::aggregate::utils::ordering_fields; +pub use datafusion_physical_expr_common::aggregate::utils::down_cast_any_ref; +pub use datafusion_physical_expr_common::aggregate::utils::get_sort_options; +pub use datafusion_physical_expr_common::aggregate::utils::ordering_fields; use arrow::array::{ArrayRef, ArrowNativeTypeOp}; use arrow_array::cast::AsArray; diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 2da2a76ec0ca..c88f1b32bbc6 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -41,7 +41,7 @@ pub mod execution_props { pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState}; pub use analysis::{analyze, AnalysisContext, ExprBoundaries}; -pub use datafusion_physical_expr_core::aggregate::AggregateExpr; +pub use datafusion_physical_expr_common::aggregate::AggregateExpr; pub use equivalence::EquivalenceProperties; pub use partitioning::{Distribution, Partitioning}; pub use physical_expr::{ @@ -49,8 +49,8 @@ pub use physical_expr::{ PhysicalExprRef, }; -pub use datafusion_physical_expr_core::physical_expr::PhysicalExpr; -pub use datafusion_physical_expr_core::sort_expr::{ +pub use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +pub use datafusion_physical_expr_common::sort_expr::{ LexOrdering, LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalSortExpr, PhysicalSortRequirement, }; @@ -58,19 +58,19 @@ pub use datafusion_physical_expr_core::sort_expr::{ pub use planner::{create_physical_expr, create_physical_exprs}; pub use scalar_function::ScalarFunctionExpr; -pub use datafusion_physical_expr_core::utils::reverse_order_bys; +pub use datafusion_physical_expr_common::utils::reverse_order_bys; pub use utils::split_conjunction; pub use aggregate::first_last::create_first_value_accumulator; // For backwards compatibility pub mod sort_properties { - pub use datafusion_physical_expr_core::sort_properties::{ + pub use datafusion_physical_expr_common::sort_properties::{ ExprOrdering, SortProperties, }; } // For backwards compatibility pub mod tree_node { - pub use datafusion_physical_expr_core::tree_node::ExprContext; + pub use datafusion_physical_expr_common::tree_node::ExprContext; } diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index 6785e1fe60f6..bc265d3819a5 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -17,10 +17,10 @@ use std::sync::Arc; -use datafusion_physical_expr_core::physical_expr::PhysicalExpr; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use itertools::izip; -pub use datafusion_physical_expr_core::physical_expr::down_cast_any_ref; +pub use 
datafusion_physical_expr_common::physical_expr::down_cast_any_ref;
 
 /// Shared [`PhysicalExpr`].
 pub type PhysicalExprRef = Arc<dyn PhysicalExpr>;

From 26f852c2c53c59281d1c455a9204ce2163ca33dd Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Fri, 5 Apr 2024 08:15:33 +0800
Subject: [PATCH 16/38] cleanup

Signed-off-by: jayzhan211
---
 datafusion-cli/Cargo.lock                      |  4 +-
 datafusion/physical-expr-common/Cargo.toml     |  2 +-
 .../physical-expr-common/src/physical_expr.rs  | 61 +------------------
 .../physical-expr/src/expressions/mod.rs       |  2 +-
 datafusion/physical-expr/src/planner.rs        | 61 +++++++++++++++++++
 5 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index feb720c41fbc..d744a891c6a6 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1331,7 +1331,7 @@ dependencies = [
 "datafusion-common",
 "datafusion-execution",
 "datafusion-expr",
- "datafusion-physical-expr-core",
+ "datafusion-physical-expr-common",
 "half",
 "hashbrown 0.14.3",
 "hex",
@@ -1347,7 +1347,7 @@ dependencies = [
 ]
 
 [[package]]
-name = "datafusion-physical-expr-core"
+name = "datafusion-physical-expr-common"
 version = "37.0.0"
 dependencies = [
 "arrow",
diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml
index 1a347643e6eb..89a41a5d10ce 100644
--- a/datafusion/physical-expr-common/Cargo.toml
+++ b/datafusion/physical-expr-common/Cargo.toml
@@ -17,7 +17,7 @@
 
 [package]
 name = "datafusion-physical-expr-common"
-description = "Core physical expression implementation for DataFusion query engine"
+description = "Common functionality of physical expressions for the DataFusion query engine"
 keywords = ["arrow", "query", "sql"]
 readme = "README.md"
diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs
index b5ee582ab741..be6358e73c99 100644
--- a/datafusion/physical-expr-common/src/physical_expr.rs
+++ b/datafusion/physical-expr-common/src/physical_expr.rs
@@ -32,65 +32,8 @@ use datafusion_expr::ColumnarValue;
 use crate::sort_properties::SortProperties;
 use crate::utils::scatter;
 
-/// `PhysicalExpr` evaluate DataFusion expressions such as `A + 1`, or `CAST(c1
-/// AS int)`.
-///
-/// `PhysicalExpr` are the physical counterpart to [`Expr`] used in logical
-/// planning, and can be evaluated directly on a [`RecordBatch`]. They are
-/// normally created from `Expr` by a [`PhysicalPlanner`] and can be created
-/// directly using [`create_physical_expr`].
-///
-/// A Physical expression knows its type, nullability and how to evaluate itself.
-///
-/// [`Expr`]: datafusion_expr::Expr
-/// [`PhysicalPlanner`]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html
-/// [`create_physical_expr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html
-///
-/// # Example: Create `PhysicalExpr` from `Expr`
-/// ```ignore
-/// # use arrow::datatypes::{DataType, Field, Schema};
-/// # use datafusion_common::DFSchema;
-/// # use datafusion_expr::{Expr, col, lit};
-/// # use datafusion_physical_expr::create_physical_expr;
-/// # use datafusion_expr::execution_props::ExecutionProps;
-/// // For a logical expression `a = 1`, we can create a physical expression
-/// let expr = col("a").eq(lit(1));
-/// // To create a PhysicalExpr we need 1.
a schema -/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); -/// let df_schema = DFSchema::try_from(schema).unwrap(); -/// // 2. ExecutionProps -/// let props = ExecutionProps::new(); -/// // We can now create a PhysicalExpr: -/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); -/// ``` -/// -/// # Example: Executing a PhysicalExpr to obtain [`ColumnarValue`] -/// ```ignore -/// # use std::sync::Arc; -/// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; -/// # use arrow::datatypes::{DataType, Field, Schema}; -/// # use datafusion_common::{assert_batches_eq, DFSchema}; -/// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; -/// # use datafusion_physical_expr::create_physical_expr; -/// # use datafusion_expr::execution_props::ExecutionProps; -/// # let expr = col("a").eq(lit(1)); -/// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); -/// # let df_schema = DFSchema::try_from(schema.clone()).unwrap(); -/// # let props = ExecutionProps::new(); -/// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this: -/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); -/// // Input of [1,2,3] -/// let input_batch = RecordBatch::try_from_iter(vec![ -/// ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _) -/// ]).unwrap(); -/// // The result is a ColumnarValue (either an Array or a Scalar) -/// let result = physical_expr.evaluate(&input_batch).unwrap(); -/// // In this case, a BooleanArray with the result of the comparison -/// let ColumnarValue::Array(arr) = result else { -/// panic!("Expected an array") -/// }; -/// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false])); -/// ``` +/// See [create_physical_expr](https://docs.rs/datafusion/latest/datafusion/physical_expr/fn.create_physical_expr.html) +/// for examples of creating `PhysicalExpr` from `Expr` pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq { /// Returns the physical expression as [`Any`] so that it can be /// downcast to a specific implementation. diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 13932cde6d1a..f0cc4b175ea5 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -81,7 +81,7 @@ pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; -pub use datafusion_physical_expr_core::expressions::column::{col, Column}; +pub use datafusion_physical_expr_common::expressions::column::{col, Column}; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 0dbea09ffb51..44c9d33d6052 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -33,6 +33,67 @@ use datafusion_expr::{ }; use std::sync::Arc; +/// [PhysicalExpr] evaluate DataFusion expressions such as `A + 1`, or `CAST(c1 +/// AS int)`. +/// +/// [PhysicalExpr] are the physical counterpart to [Expr] used in logical +/// planning, and can be evaluated directly on a [RecordBatch]. They are +/// normally created from [Expr] by a [PhysicalPlanner] and can be created +/// directly using [create_physical_expr]. 
+/// +/// A Physical expression knows its type, nullability and how to evaluate itself. +/// +/// [PhysicalPlanner]: https://docs.rs/datafusion/latest/datafusion/physical_planner/trait.PhysicalPlanner.html +/// [RecordBatch]: https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html +/// +/// # Example: Create `PhysicalExpr` from `Expr` +/// ``` +/// # use arrow::datatypes::{DataType, Field, Schema}; +/// # use datafusion_common::DFSchema; +/// # use datafusion_expr::{Expr, col, lit}; +/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_expr::execution_props::ExecutionProps; +/// // For a logical expression `a = 1`, we can create a physical expression +/// let expr = col("a").eq(lit(1)); +/// // To create a PhysicalExpr we need 1. a schema +/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); +/// let df_schema = DFSchema::try_from(schema).unwrap(); +/// // 2. ExecutionProps +/// let props = ExecutionProps::new(); +/// // We can now create a PhysicalExpr: +/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); +/// ``` +/// +/// # Example: Executing a PhysicalExpr to obtain [ColumnarValue] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::array::{cast::AsArray, BooleanArray, Int32Array, RecordBatch}; +/// # use arrow::datatypes::{DataType, Field, Schema}; +/// # use datafusion_common::{assert_batches_eq, DFSchema}; +/// # use datafusion_expr::{Expr, col, lit, ColumnarValue}; +/// # use datafusion_physical_expr::create_physical_expr; +/// # use datafusion_expr::execution_props::ExecutionProps; +/// # let expr = col("a").eq(lit(1)); +/// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); +/// # let df_schema = DFSchema::try_from(schema.clone()).unwrap(); +/// # let props = ExecutionProps::new(); +/// // Given a PhysicalExpr, for `a = 1` we can evaluate it against a RecordBatch like this: +/// let physical_expr = create_physical_expr(&expr, &df_schema, &props).unwrap(); +/// // Input of [1,2,3] +/// let input_batch = RecordBatch::try_from_iter(vec![ +/// ("a", Arc::new(Int32Array::from(vec![1, 2, 3])) as _) +/// ]).unwrap(); +/// // The result is a ColumnarValue (either an Array or a Scalar) +/// let result = physical_expr.evaluate(&input_batch).unwrap(); +/// // In this case, a BooleanArray with the result of the comparison +/// let ColumnarValue::Array(arr) = result else { +/// panic!("Expected an array") +/// }; +/// assert_eq!(arr.as_boolean(), &BooleanArray::from(vec![true, false, false])); +/// ``` +/// +/// [ColumnarValue]: datafusion_expr::ColumnarValue +/// /// Create a physical expression from a logical expression ([Expr]). 
/// /// # Arguments From 2bc58c1a8964bb0de303f85a540ff7b3e6159fcf Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 08:41:49 +0800 Subject: [PATCH 17/38] backup Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/Cargo.toml | 22 +- .../aggregate-functions/src/first_last.rs | 233 +++++++++++++++++- datafusion/physical-expr/Cargo.toml | 1 + .../physical-expr/src/aggregate/first_last.rs | 218 +--------------- 4 files changed, 229 insertions(+), 245 deletions(-) diff --git a/datafusion/aggregate-functions/Cargo.toml b/datafusion/aggregate-functions/Cargo.toml index a733b4e8142c..6330da97f69b 100644 --- a/datafusion/aggregate-functions/Cargo.toml +++ b/datafusion/aggregate-functions/Cargo.toml @@ -36,26 +36,6 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } -base64 = { version = "0.22", optional = true } -blake2 = { version = "^0.10.2", optional = true } -blake3 = { version = "1.0", optional = true } -chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } -datafusion-expr = { workspace = true } -datafusion-physical-expr = { workspace = true, default-features = true } -hashbrown = { version = "0.14", features = ["raw"], optional = true } -hex = { version = "0.4", optional = true } -itertools = { workspace = true } -log = { workspace = true } -md-5 = { version = "^0.10.0", optional = true } -regex = { version = "1.8", optional = true } -sha2 = { version = "^0.10.1", optional = true } -unicode-segmentation = { version = "^1.7.1", optional = true } -uuid = { version = "1.7", features = ["v4"], optional = true } - -[dev-dependencies] -criterion = "0.5" -rand = { workspace = true } -rstest = { workspace = true } -tokio = { workspace = true, features = ["macros", "rt", "sync"] } \ No newline at end of file +datafusion-expr = { workspace = true } \ No newline at end of file diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index e09089c5abe7..47237e5dd61e 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -25,13 +25,13 @@ use datafusion_expr::{Accumulator, AccumulatorFactoryFunction, AggregateUDF, Agg use std::any::Any; use std::fmt::Debug; -make_udf_function!( - FirstValue, - first_value, - value: Expr, - "Returns the first value in a group of values.", - first_value_fn -); +// make_udf_function!( +// FirstValue, +// first_value, +// value: Expr, +// "Returns the first value in a group of values.", +// first_value_fn +// ); pub struct FirstValue { @@ -114,4 +114,223 @@ pub fn create_first_value( accumulator: AccumulatorFactoryFunction, ) -> AggregateUDF { AggregateUDF::from(FirstValue::new(accumulator)) +} + +pub fn create_first_value_accumulator( + acc_args: AccumulatorArgs, +) -> Result> { + let mut all_sort_orders = vec![]; + + // Construct PhysicalSortExpr objects from Expr objects: + let mut sort_exprs = vec![]; + for expr in acc_args.sort_exprs { + if let Expr::Sort(sort) = expr { + if let Expr::Column(col) = sort.expr.as_ref() { + let name = &col.name; + let e = expressions::col(name, acc_args.schema)?; + sort_exprs.push(PhysicalSortExpr { + expr: e, + options: SortOptions { + descending: !sort.asc, + nulls_first: sort.nulls_first, + }, + }); + } + } + } + if !sort_exprs.is_empty() { + all_sort_orders.extend(sort_exprs); + } + + let ordering_req = all_sort_orders; + + let ordering_dtypes = ordering_req + .iter() + .map(|e| e.expr.data_type(acc_args.schema)) + 
+        .collect::<Result<Vec<_>>>()?;
+
+    let requirement_satisfied = ordering_req.is_empty();
+
+    FirstValueAccumulator::try_new(
+        acc_args.data_type,
+        &ordering_dtypes,
+        ordering_req,
+        acc_args.ignore_nulls,
+    )
+    .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
+}
+
+#[derive(Debug)]
+pub struct FirstValueAccumulator {
+    first: ScalarValue,
+    // At the beginning, `is_set` is false, which means `first` is not seen yet.
+    // Once we see the first value, we set the `is_set` flag and do not update `first` anymore.
+    is_set: bool,
+    // Stores ordering values, of the aggregator requirement corresponding to first value
+    // of the aggregator. These values are used during merging of multiple partitions.
+    orderings: Vec<ScalarValue>,
+    // Stores the applicable ordering requirement.
+    ordering_req: LexOrdering,
+    // Stores whether incoming data already satisfies the ordering requirement.
+    requirement_satisfied: bool,
+    // Ignore null values.
+    ignore_nulls: bool,
+}
+
+impl FirstValueAccumulator {
+    /// Creates a new `FirstValueAccumulator` for the given `data_type`.
+    pub fn try_new(
+        data_type: &DataType,
+        ordering_dtypes: &[DataType],
+        ordering_req: LexOrdering,
+        ignore_nulls: bool,
+    ) -> Result<Self> {
+        let orderings = ordering_dtypes
+            .iter()
+            .map(ScalarValue::try_from)
+            .collect::<Result<Vec<_>>>()?;
+        let requirement_satisfied = ordering_req.is_empty();
+        ScalarValue::try_from(data_type).map(|first| Self {
+            first,
+            is_set: false,
+            orderings,
+            ordering_req,
+            requirement_satisfied,
+            ignore_nulls,
+        })
+    }
+
+    // Updates state with the values in the given row.
+    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
+        self.first = row[0].clone();
+        self.orderings = row[1..].to_vec();
+        self.is_set = true;
+    }
+
+    fn get_first_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
+        let [value, ordering_values @ ..] = values else {
+            return internal_err!("Empty row in FIRST_VALUE");
+        };
+        if self.requirement_satisfied {
+            // Get first entry according to the pre-existing ordering (0th index):
+            if self.ignore_nulls {
+                // If ignoring nulls, find the first non-null value.
+                for i in 0..value.len() {
+                    if !value.is_null(i) {
+                        return Ok(Some(i));
+                    }
+                }
+                return Ok(None);
+            } else {
+                // If not ignoring nulls, return the first value if it exists.
+                return Ok((!value.is_empty()).then_some(0));
+            }
+        }
+        let sort_columns = ordering_values
+            .iter()
+            .zip(self.ordering_req.iter())
+            .map(|(values, req)| SortColumn {
+                values: values.clone(),
+                options: Some(req.options),
+            })
+            .collect::<Vec<_>>();
+
+        if self.ignore_nulls {
+            let indices = lexsort_to_indices(&sort_columns, None)?;
+            // If ignoring nulls, find the first non-null value.
+            for index in indices.iter().flatten() {
+                if !value.is_null(index as usize) {
+                    return Ok(Some(index as usize));
+                }
+            }
+            Ok(None)
+        } else {
+            let indices = lexsort_to_indices(&sort_columns, Some(1))?;
+            Ok((!indices.is_empty()).then_some(indices.value(0) as _))
+        }
+    }
+
+    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
+        self.requirement_satisfied = requirement_satisfied;
+        self
+    }
+}
+
+impl Accumulator for FirstValueAccumulator {
+    fn state(&mut self) -> Result<Vec<ScalarValue>> {
+        let mut result = vec![self.first.clone()];
+        result.extend(self.orderings.iter().cloned());
+        result.push(ScalarValue::Boolean(Some(self.is_set)));
+        Ok(result)
+    }
+
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        if !self.is_set {
+            if let Some(first_idx) = self.get_first_idx(values)?
{ + let row = get_row_at_idx(values, first_idx)?; + self.update_with_new_row(&row); + } + } else if !self.requirement_satisfied { + if let Some(first_idx) = self.get_first_idx(values)? { + let row = get_row_at_idx(values, first_idx)?; + let orderings = &row[1..]; + if compare_rows( + &self.orderings, + orderings, + &get_sort_options(&self.ordering_req), + )? + .is_gt() + { + self.update_with_new_row(&row); + } + } + } + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + // FIRST_VALUE(first1, first2, first3, ...) + // last index contains is_set flag. + let is_set_idx = states.len() - 1; + let flags = states[is_set_idx].as_boolean(); + let filtered_states = filter_states_according_to_is_set(states, flags)?; + // 1..is_set_idx range corresponds to ordering section + let sort_cols = + convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req); + + let ordered_states = if sort_cols.is_empty() { + // When no ordering is given, use the existing state as is: + filtered_states + } else { + let indices = lexsort_to_indices(&sort_cols, None)?; + get_arrayref_at_indices(&filtered_states, &indices)? + }; + if !ordered_states[0].is_empty() { + let first_row = get_row_at_idx(&ordered_states, 0)?; + // When collecting orderings, we exclude the is_set flag from the state. + let first_ordering = &first_row[1..is_set_idx]; + let sort_options = get_sort_options(&self.ordering_req); + // Either there is no existing value, or there is an earlier version in new data. + if !self.is_set + || compare_rows(&self.orderings, first_ordering, &sort_options)?.is_gt() + { + // Update with first value in the state. Note that we should exclude the + // is_set flag from the state. Otherwise, we will end up with a state + // containing two is_set flags. 
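+                // (`0..is_set_idx` keeps the first value and its ordering
+                // columns while dropping the trailing flag column.)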
+                self.update_with_new_row(&first_row[0..is_set_idx]);
+            }
+        }
+        Ok(())
+    }
+
+    fn evaluate(&mut self) -> Result<ScalarValue> {
+        Ok(self.first.clone())
+    }
+
+    fn size(&self) -> usize {
+        std::mem::size_of_val(self) - std::mem::size_of_val(&self.first)
+            + self.first.size()
+            + ScalarValue::size_of_vec(&self.orderings)
+            - std::mem::size_of_val(&self.orderings)
+    }
+}
\ No newline at end of file
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index a345b9b59727..1e308190d954 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -59,6 +59,7 @@ chrono = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-aggregate-functions = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 half = { workspace = true }
 hashbrown = { version = "0.14", features = ["raw"] }
diff --git a/datafusion/physical-expr/src/aggregate/first_last.rs b/datafusion/physical-expr/src/aggregate/first_last.rs
index 26bd219f65f0..87f3c2bfcd9a 100644
--- a/datafusion/physical-expr/src/aggregate/first_last.rs
+++ b/datafusion/physical-expr/src/aggregate/first_last.rs
@@ -30,6 +30,7 @@ use arrow::array::{Array, ArrayRef, AsArray, BooleanArray};
 use arrow::compute::{self, lexsort_to_indices, SortColumn};
 use arrow::datatypes::{DataType, Field};
 use arrow_schema::SortOptions;
+use datafusion_aggregate_functions::first_last::FirstValueAccumulator;
 use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx};
 use datafusion_common::{
     arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue,
@@ -218,224 +219,7 @@ impl PartialEq<dyn Any> for FirstValue {
     }
 }
 
-#[derive(Debug)]
-struct FirstValueAccumulator {
-    first: ScalarValue,
-    // At the beginning, `is_set` is false, which means `first` is not seen yet.
-    // Once we see the first value, we set the `is_set` flag and do not update `first` anymore.
-    is_set: bool,
-    // Stores ordering values, of the aggregator requirement corresponding to first value
-    // of the aggregator. These values are used during merging of multiple partitions.
-    orderings: Vec<ScalarValue>,
-    // Stores the applicable ordering requirement.
-    ordering_req: LexOrdering,
-    // Stores whether incoming data already satisfies the ordering requirement.
-    requirement_satisfied: bool,
-    // Ignore null values.
-    ignore_nulls: bool,
-}
-
-impl FirstValueAccumulator {
-    /// Creates a new `FirstValueAccumulator` for the given `data_type`.
-    pub fn try_new(
-        data_type: &DataType,
-        ordering_dtypes: &[DataType],
-        ordering_req: LexOrdering,
-        ignore_nulls: bool,
-    ) -> Result<Self> {
-        let orderings = ordering_dtypes
-            .iter()
-            .map(ScalarValue::try_from)
-            .collect::<Result<Vec<_>>>()?;
-        let requirement_satisfied = ordering_req.is_empty();
-        ScalarValue::try_from(data_type).map(|first| Self {
-            first,
-            is_set: false,
-            orderings,
-            ordering_req,
-            requirement_satisfied,
-            ignore_nulls,
-        })
-    }
-
-    // Updates state with the values in the given row.
-    fn update_with_new_row(&mut self, row: &[ScalarValue]) {
-        self.first = row[0].clone();
-        self.orderings = row[1..].to_vec();
-        self.is_set = true;
-    }
-
-    fn get_first_idx(&self, values: &[ArrayRef]) -> Result<Option<usize>> {
-        let [value, ordering_values @ ..]
= values else { - return internal_err!("Empty row in FIRST_VALUE"); - }; - if self.requirement_satisfied { - // Get first entry according to the pre-existing ordering (0th index): - if self.ignore_nulls { - // If ignoring nulls, find the first non-null value. - for i in 0..value.len() { - if !value.is_null(i) { - return Ok(Some(i)); - } - } - return Ok(None); - } else { - // If not ignoring nulls, return the first value if it exists. - return Ok((!value.is_empty()).then_some(0)); - } - } - let sort_columns = ordering_values - .iter() - .zip(self.ordering_req.iter()) - .map(|(values, req)| SortColumn { - values: values.clone(), - options: Some(req.options), - }) - .collect::>(); - - if self.ignore_nulls { - let indices = lexsort_to_indices(&sort_columns, None)?; - // If ignoring nulls, find the first non-null value. - for index in indices.iter().flatten() { - if !value.is_null(index as usize) { - return Ok(Some(index as usize)); - } - } - Ok(None) - } else { - let indices = lexsort_to_indices(&sort_columns, Some(1))?; - Ok((!indices.is_empty()).then_some(indices.value(0) as _)) - } - } - - fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { - self.requirement_satisfied = requirement_satisfied; - self - } -} - -impl Accumulator for FirstValueAccumulator { - fn state(&mut self) -> Result> { - let mut result = vec![self.first.clone()]; - result.extend(self.orderings.iter().cloned()); - result.push(ScalarValue::Boolean(Some(self.is_set))); - Ok(result) - } - - fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - if !self.is_set { - if let Some(first_idx) = self.get_first_idx(values)? { - let row = get_row_at_idx(values, first_idx)?; - self.update_with_new_row(&row); - } - } else if !self.requirement_satisfied { - if let Some(first_idx) = self.get_first_idx(values)? { - let row = get_row_at_idx(values, first_idx)?; - let orderings = &row[1..]; - if compare_rows( - &self.orderings, - orderings, - &get_sort_options(&self.ordering_req), - )? - .is_gt() - { - self.update_with_new_row(&row); - } - } - } - Ok(()) - } - - fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - // FIRST_VALUE(first1, first2, first3, ...) - // last index contains is_set flag. - let is_set_idx = states.len() - 1; - let flags = states[is_set_idx].as_boolean(); - let filtered_states = filter_states_according_to_is_set(states, flags)?; - // 1..is_set_idx range corresponds to ordering section - let sort_cols = - convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req); - - let ordered_states = if sort_cols.is_empty() { - // When no ordering is given, use the existing state as is: - filtered_states - } else { - let indices = lexsort_to_indices(&sort_cols, None)?; - get_arrayref_at_indices(&filtered_states, &indices)? - }; - if !ordered_states[0].is_empty() { - let first_row = get_row_at_idx(&ordered_states, 0)?; - // When collecting orderings, we exclude the is_set flag from the state. - let first_ordering = &first_row[1..is_set_idx]; - let sort_options = get_sort_options(&self.ordering_req); - // Either there is no existing value, or there is an earlier version in new data. - if !self.is_set - || compare_rows(&self.orderings, first_ordering, &sort_options)?.is_gt() - { - // Update with first value in the state. Note that we should exclude the - // is_set flag from the state. Otherwise, we will end up with a state - // containing two is_set flags. 
- self.update_with_new_row(&first_row[0..is_set_idx]); - } - } - Ok(()) - } - - fn evaluate(&mut self) -> Result { - Ok(self.first.clone()) - } - - fn size(&self) -> usize { - std::mem::size_of_val(self) - std::mem::size_of_val(&self.first) - + self.first.size() - + ScalarValue::size_of_vec(&self.orderings) - - std::mem::size_of_val(&self.orderings) - } -} - -pub fn create_first_value_accumulator( - acc_args: AccumulatorArgs, -) -> Result> { - let mut all_sort_orders = vec![]; - - // Construct PhysicalSortExpr objects from Expr objects: - let mut sort_exprs = vec![]; - for expr in acc_args.sort_exprs { - if let Expr::Sort(sort) = expr { - if let Expr::Column(col) = sort.expr.as_ref() { - let name = &col.name; - let e = expressions::col(name, acc_args.schema)?; - sort_exprs.push(PhysicalSortExpr { - expr: e, - options: SortOptions { - descending: !sort.asc, - nulls_first: sort.nulls_first, - }, - }); - } - } - } - if !sort_exprs.is_empty() { - all_sort_orders.extend(sort_exprs); - } - - let ordering_req = all_sort_orders; - - let ordering_dtypes = ordering_req - .iter() - .map(|e| e.expr.data_type(acc_args.schema)) - .collect::>>()?; - - let requirement_satisfied = ordering_req.is_empty(); - - FirstValueAccumulator::try_new( - acc_args.data_type, - &ordering_dtypes, - ordering_req, - acc_args.ignore_nulls, - ) - .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _) -} /// LAST_VALUE aggregate expression #[derive(Debug, Clone)] From 30d5576769e8f19826a4b77cb66840cb6bb4e7ea Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 20:16:55 +0800 Subject: [PATCH 18/38] move acc to first value Signed-off-by: jayzhan211 --- Cargo.toml | 2 +- datafusion/aggregate-functions/Cargo.toml | 4 +- .../aggregate-functions/src/first_last.rs | 100 ++++++++++++------ datafusion/aggregate-functions/src/lib.rs | 7 +- datafusion/aggregate-functions/src/macros.rs | 2 +- datafusion/common/src/utils.rs | 5 - datafusion/core/Cargo.toml | 1 + datafusion/core/src/execution/context/mod.rs | 21 +--- datafusion/expr/src/expr_fn.rs | 2 +- datafusion/expr/src/udaf.rs | 7 +- datafusion/expr/src/utils.rs | 6 ++ datafusion/physical-expr/Cargo.toml | 2 +- .../physical-expr/src/aggregate/first_last.rs | 47 ++------ datafusion/physical-expr/src/lib.rs | 2 - 14 files changed, 98 insertions(+), 110 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bbf7897194f6..1864afbef81b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,11 +74,11 @@ chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" datafusion = { path = "datafusion/core", version = "37.0.0", default-features = false } +datafusion-aggregate-functions = { path = "datafusion/aggregate-functions", version = "37.0.0" } datafusion-common = { path = "datafusion/common", version = "37.0.0", default-features = false } datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.0.0" } datafusion-execution = { path = "datafusion/execution", version = "37.0.0" } datafusion-expr = { path = "datafusion/expr", version = "37.0.0" } -datafusion-aggregate-functions = { path = "datafusion/aggregate-functions", version = "37.0.0" } datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } diff --git a/datafusion/aggregate-functions/Cargo.toml b/datafusion/aggregate-functions/Cargo.toml index 
6330da97f69b..cd20b37ad922 100644 --- a/datafusion/aggregate-functions/Cargo.toml +++ b/datafusion/aggregate-functions/Cargo.toml @@ -38,4 +38,6 @@ path = "src/lib.rs" arrow = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } -datafusion-expr = { workspace = true } \ No newline at end of file +datafusion-expr = { workspace = true } +datafusion-physical-expr-common = { workspace = true } +log = { workspace = true } diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index 47237e5dd61e..dcc82ec5c0cb 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -1,4 +1,3 @@ - // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -16,15 +15,27 @@ // specific language governing permissions and limitations // under the License. +use arrow::array::{ArrayRef, AsArray, BooleanArray}; +use arrow::compute::{self, lexsort_to_indices, SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field}; -use datafusion_common::utils::format_state_name; -use datafusion_common::Result; +use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx}; +use datafusion_common::{ + arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, +}; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::type_coercion::aggregates::NUMERICS; -use datafusion_expr::{Accumulator, AccumulatorFactoryFunction, AggregateUDF, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::utils::format_state_name; +use datafusion_expr::{ + Accumulator, AccumulatorFactoryFunction, AggregateUDFImpl, Expr, + Signature, Volatility, +}; +use datafusion_physical_expr_common::aggregate::utils::get_sort_options; +use datafusion_physical_expr_common::expressions; +use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use std::any::Any; use std::fmt::Debug; +// TODO: macro udaf // make_udf_function!( // FirstValue, // first_value, @@ -33,7 +44,6 @@ use std::fmt::Debug; // first_value_fn // ); - pub struct FirstValue { signature: Signature, aliases: Vec, @@ -51,13 +61,9 @@ impl Debug for FirstValue { } impl FirstValue { - pub fn new( - accumulator: AccumulatorFactoryFunction, - ) -> Self { + pub fn new(accumulator: AccumulatorFactoryFunction) -> Self { Self { - aliases: vec![ - String::from("FIRST_VALUE"), - ], + aliases: vec![String::from("FIRST_VALUE")], signature: Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable), accumulator, } @@ -81,10 +87,7 @@ impl AggregateUDFImpl for FirstValue { Ok(arg_types[0].clone()) } - fn accumulator( - &self, - acc_args: AccumulatorArgs, - ) -> Result> { + fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { (self.accumulator)(acc_args) } @@ -103,20 +106,21 @@ impl AggregateUDFImpl for FirstValue { fields.push(Field::new("is_set", DataType::Boolean, true)); Ok(fields) } -} -/// Creates a new UDAF with a specific signature, state type and return type. -/// The signature and state type must match the `Accumulator's implementation`. -/// TOOD: We plan to move aggregate function to its own crate. This function will be deprecated then. 
-pub fn create_first_value(
-    name: &str,
-    signature: Signature,
-    accumulator: AccumulatorFactoryFunction,
-) -> AggregateUDF {
-    AggregateUDF::from(FirstValue::new(accumulator))
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
 }
 
-pub fn create_first_value_accumulator(
+// /// Creates a new UDAF with a specific signature, state type and return type.
+// /// The signature and state type must match the `Accumulator`'s implementation.
+// /// TODO: We plan to move aggregate functions to their own crate. This function will be deprecated then.
+// pub fn create_first_value() -> AggregateUDF {
+//     let accumulator = Arc::new(create_first_value_accumulator);
+//     AggregateUDF::from(FirstValue::new(accumulator))
+// }
+
+pub(crate) fn create_first_value_accumulator(
     acc_args: AccumulatorArgs,
 ) -> Result<Box<dyn Accumulator>> {
     let mut all_sort_orders = vec![];
@@ -127,7 +131,7 @@ pub fn create_first_value_accumulator(
         if let Expr::Sort(sort) = expr {
             if let Expr::Column(col) = sort.expr.as_ref() {
                 let name = &col.name;
-                let e = expressions::col(name, acc_args.schema)?;
+                let e = expressions::column::col(name, acc_args.schema)?;
                 sort_exprs.push(PhysicalSortExpr {
                     expr: e,
                     options: SortOptions {
@@ -200,6 +204,11 @@ impl FirstValueAccumulator {
         })
     }
 
+    pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
+        self.requirement_satisfied = requirement_satisfied;
+        self
+    }
+
     // Updates state with the values in the given row.
     fn update_with_new_row(&mut self, row: &[ScalarValue]) {
         self.first = row[0].clone();
@@ -249,11 +258,6 @@ impl FirstValueAccumulator {
             Ok((!indices.is_empty()).then_some(indices.value(0) as _))
         }
     }
-
-    fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self {
-        self.requirement_satisfied = requirement_satisfied;
-        self
-    }
 }
 
 impl Accumulator for FirstValueAccumulator {
@@ -333,4 +337,34 @@ impl Accumulator for FirstValueAccumulator {
             + ScalarValue::size_of_vec(&self.orderings)
             - std::mem::size_of_val(&self.orderings)
     }
-}
\ No newline at end of file
+}
+
+/// Filters states according to the `is_set` flag at the last column and returns
+/// the resulting states.
+///
+/// TODO: This function can be private once the `LAST_VALUE` function is moved to the `aggregate-functions` crate.
+pub fn filter_states_according_to_is_set(
+    states: &[ArrayRef],
+    flags: &BooleanArray,
+) -> Result<Vec<ArrayRef>> {
+    states
+        .iter()
+        .map(|state| compute::filter(state, flags).map_err(|e| arrow_datafusion_err!(e)))
+        .collect::<Result<Vec<_>>>()
+}
+
+/// Combines array refs and their corresponding orderings to construct `SortColumn`s.
+///
+/// TODO: This function can be private once the `LAST_VALUE` function is moved to the `aggregate-functions` crate.
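+///
+/// Illustrative sketch (fenced as `ignore`; assumes `arrs` and `sort_exprs`
+/// of equal length are in scope):
+///
+/// ```ignore
+/// let sort_cols = convert_to_sort_cols(&arrs, &sort_exprs);
+/// // Each SortColumn pairs an array with the options of its sort expression.
+/// assert_eq!(sort_cols.len(), arrs.len());
+/// ```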
+pub fn convert_to_sort_cols( + arrs: &[ArrayRef], + sort_exprs: &[PhysicalSortExpr], +) -> Vec { + arrs.iter() + .zip(sort_exprs.iter()) + .map(|(item, sort_expr)| SortColumn { + values: item.clone(), + options: Some(sort_expr.options), + }) + .collect::>() +} diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs index 6435022eb60b..15d17e0805d8 100644 --- a/datafusion/aggregate-functions/src/lib.rs +++ b/datafusion/aggregate-functions/src/lib.rs @@ -82,6 +82,7 @@ use std::sync::Arc; use datafusion_common::Result; use datafusion_execution::FunctionRegistry; use datafusion_expr::AggregateUDF; +use first_last::{create_first_value_accumulator, FirstValue}; use log::debug; pub mod first_last; @@ -95,7 +96,11 @@ pub mod expr_fn {} /// Registers all enabled packages with a [`FunctionRegistry`] pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { - let mut functions: Vec> = vec![]; + // TODO: macro this creation + let accumulator = Arc::new(create_first_value_accumulator); + let first_value = AggregateUDF::from(FirstValue::new(accumulator)); + + let functions: Vec> = vec![first_value.into()]; functions.into_iter().try_for_each(|udf| { let existing_udaf = registry.register_udaf(udf)?; diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/aggregate-functions/src/macros.rs index dc7d4a3babcb..000d9d4200cc 100644 --- a/datafusion/aggregate-functions/src/macros.rs +++ b/datafusion/aggregate-functions/src/macros.rs @@ -47,4 +47,4 @@ macro_rules! make_udf_function { } } } -} \ No newline at end of file +} diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs index e09684e36524..3296e68d17da 100644 --- a/datafusion/common/src/utils.rs +++ b/datafusion/common/src/utils.rs @@ -679,11 +679,6 @@ pub fn find_indices>( .ok_or_else(|| DataFusionError::Execution("Target not found".to_string())) } -/// Construct state name. State is the intermidiate state of the aggregate function. 
-pub fn format_state_name(name: &str, state_name: &str) -> String { - format!("{name}[{state_name}]") -} - #[cfg(test)] mod tests { use crate::ScalarValue; diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 610784f91dec..2e45fb8f4905 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -93,6 +93,7 @@ bytes = { workspace = true } bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = { workspace = true } +datafusion-aggregate-functions = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 7f267d585e32..79eb3caacc77 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -69,14 +69,11 @@ use datafusion_common::{ SchemaReference, TableReference, }; use datafusion_execution::registry::SerializerRegistry; -use datafusion_expr::type_coercion::aggregates::NUMERICS; -use datafusion_expr::{Signature, Volatility}; use datafusion_expr::{ logical_plan::{DdlStatement, Statement}, var_provider::is_system_variables, Expr, StringifiedPlan, UserDefinedLogicalNode, WindowUDF, }; -use datafusion_physical_expr::create_first_value_accumulator; use datafusion_sql::{ parser::{CopyToSource, CopyToStatement, DFParser}, planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, @@ -85,7 +82,6 @@ use datafusion_sql::{ use async_trait::async_trait; use chrono::{DateTime, Utc}; -use log::debug; use parking_lot::RwLock; use sqlparser::dialect::dialect_from_str; use url::Url; @@ -1460,21 +1456,8 @@ impl SessionState { datafusion_functions_array::register_all(&mut new_self) .expect("can not register array expressions"); - let first_value = create_first_value( - "FIRST_VALUE", - Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable), - Arc::new(create_first_value_accumulator), - ); - - match new_self.register_udaf(Arc::new(first_value)) { - Ok(Some(existing_udaf)) => { - debug!("Overwrite existing UDAF: {}", existing_udaf.name()); - } - Ok(None) => {} - Err(err) => { - panic!("Failed to register UDAF: {}", err); - } - } + datafusion_aggregate_functions::register_all(&mut new_self) + .expect("can not register aggregate functions"); new_self } diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index b666a38e6b04..cb454b4e6cad 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -24,7 +24,7 @@ use crate::expr::{ use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, }; -use crate::udaf::format_state_name; +use crate::utils::format_state_name; use crate::{ aggregate_function, built_in_function, conditional_expressions::CaseBuilder, logical_plan::Subquery, AggregateUDF, BuiltinScalarFunction, Expr, LogicalPlan, diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 3cf1845aacd6..856f0dc44246 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -19,6 +19,7 @@ use crate::function::AccumulatorArgs; use crate::groups_accumulator::GroupsAccumulator; +use crate::utils::format_state_name; use crate::{Accumulator, Expr}; use crate::{AccumulatorFactoryFunction, ReturnTypeFunction, Signature}; use arrow::datatypes::{DataType, Field}; @@ -447,9 +448,3 @@ impl AggregateUDFImpl for 
AggregateUDFLegacyWrapper { (self.accumulator)(acc_args) } } - -/// returns the name of the state -/// TODO: Remove duplicated function in physical-expr -pub(crate) fn format_state_name(name: &str, state_name: &str) -> String { - format!("{name}[{state_name}]") -} diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 72d01da20448..8add56dd349f 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -1240,6 +1240,12 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema { } } +/// Construct state name. State is the intermidiate state of the aggregate function. +/// TODO: Remove duplicated function in physical-expr +pub fn format_state_name(name: &str, state_name: &str) -> String { + format!("{name}[{state_name}]") +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index dfcbd9df7c29..352e7e44a9cd 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -56,10 +56,10 @@ base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { workspace = true } +datafusion-aggregate-functions = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-aggregate-functions = { workspace = true } datafusion-physical-expr-common = { workspace = true } half = { workspace = true } hashbrown = { version = "0.14", features = ["raw"] } diff --git a/datafusion/physical-expr/src/aggregate/first_last.rs b/datafusion/physical-expr/src/aggregate/first_last.rs index 87f3c2bfcd9a..0b0b9f398be1 100644 --- a/datafusion/physical-expr/src/aggregate/first_last.rs +++ b/datafusion/physical-expr/src/aggregate/first_last.rs @@ -21,22 +21,20 @@ use std::any::Any; use std::sync::Arc; use crate::aggregate::utils::{down_cast_any_ref, get_sort_options, ordering_fields}; -use crate::expressions::{self, format_state_name}; +use crate::expressions::format_state_name; use crate::{ reverse_order_bys, AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr, }; -use arrow::array::{Array, ArrayRef, AsArray, BooleanArray}; -use arrow::compute::{self, lexsort_to_indices, SortColumn}; +use arrow::array::{Array, ArrayRef, AsArray}; +use arrow::compute::{lexsort_to_indices, SortColumn}; use arrow::datatypes::{DataType, Field}; -use arrow_schema::SortOptions; -use datafusion_aggregate_functions::first_last::FirstValueAccumulator; -use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx}; -use datafusion_common::{ - arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, +use datafusion_aggregate_functions::first_last::{ + convert_to_sort_cols, filter_states_according_to_is_set, FirstValueAccumulator, }; -use datafusion_expr::function::AccumulatorArgs; -use datafusion_expr::{Accumulator, Expr}; +use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx}; +use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::Accumulator; /// FIRST_VALUE aggregate expression #[derive(Debug, Clone)] @@ -218,9 +216,6 @@ impl PartialEq for FirstValue { .unwrap_or(false) } } - - - /// LAST_VALUE aggregate expression #[derive(Debug, Clone)] pub struct LastValue { @@ -576,32 +571,6 @@ impl Accumulator for LastValueAccumulator { } } -/// Filters states according to the `is_set` flag at 
the last column and returns -/// the resulting states. -fn filter_states_according_to_is_set( - states: &[ArrayRef], - flags: &BooleanArray, -) -> Result> { - states - .iter() - .map(|state| compute::filter(state, flags).map_err(|e| arrow_datafusion_err!(e))) - .collect::>>() -} - -/// Combines array refs and their corresponding orderings to construct `SortColumn`s. -fn convert_to_sort_cols( - arrs: &[ArrayRef], - sort_exprs: &[PhysicalSortExpr], -) -> Vec { - arrs.iter() - .zip(sort_exprs.iter()) - .map(|(item, sort_expr)| SortColumn { - values: item.clone(), - options: Some(sort_expr.options), - }) - .collect::>() -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index c88f1b32bbc6..7b81e8f8a5c4 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -61,8 +61,6 @@ pub use scalar_function::ScalarFunctionExpr; pub use datafusion_physical_expr_common::utils::reverse_order_bys; pub use utils::split_conjunction; -pub use aggregate::first_last::create_first_value_accumulator; - // For backwards compatibility pub mod sort_properties { pub use datafusion_physical_expr_common::sort_properties::{ From 672edc7c9f0e5f2bd45b231493fd9f964fa0cd6a Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 20:39:31 +0800 Subject: [PATCH 19/38] move builtin expr too Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 14 + .../aggregate-functions/src/first_last.rs | 661 ++++++++++++++++- datafusion/aggregate-functions/src/macros.rs | 62 +- .../physical-expr/src/aggregate/first_last.rs | 689 ------------------ datafusion/physical-expr/src/aggregate/mod.rs | 1 - .../physical-expr/src/expressions/mod.rs | 4 +- .../physical-plan/src/aggregates/mod.rs | 2 +- 7 files changed, 707 insertions(+), 726 deletions(-) delete mode 100644 datafusion/physical-expr/src/aggregate/first_last.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index d744a891c6a6..6f5f035ed300 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1130,6 +1130,7 @@ dependencies = [ "bzip2", "chrono", "dashmap", + "datafusion-aggregate-functions", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1165,6 +1166,18 @@ dependencies = [ "zstd 0.13.0", ] +[[package]] +name = "datafusion-aggregate-functions" +version = "37.0.0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "log", +] + [[package]] name = "datafusion-cli" version = "37.0.0" @@ -1328,6 +1341,7 @@ dependencies = [ "blake2", "blake3", "chrono", + "datafusion-aggregate-functions", "datafusion-common", "datafusion-execution", "datafusion-expr", diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index dcc82ec5c0cb..5b2c2ff62e61 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Defines the FIRST_VALUE/LAST_VALUE aggregations. 
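//!
//! A minimal usage sketch, mirroring the unit tests at the bottom of this
//! file (no ordering columns, nulls not ignored); once set, the accumulator
//! latches onto the first row it sees and ignores later batches:
//!
//!     let mut acc =
//!         FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?;
//!     acc.update_batch(&[Arc::new(Int64Array::from(vec![5, 6, 7])) as ArrayRef])?;
//!     acc.update_batch(&[Arc::new(Int64Array::from(vec![8, 9])) as ArrayRef])?;
//!     assert_eq!(acc.evaluate()?, ScalarValue::Int64(Some(5)));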
+ use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::compute::{self, lexsort_to_indices, SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field}; @@ -26,14 +28,20 @@ use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AccumulatorFactoryFunction, AggregateUDFImpl, Expr, - Signature, Volatility, + Accumulator, AccumulatorFactoryFunction, AggregateUDFImpl, Expr, Signature, + Volatility, +}; +use datafusion_physical_expr_common::aggregate::utils::{ + down_cast_any_ref, get_sort_options, ordering_fields, }; -use datafusion_physical_expr_common::aggregate::utils::get_sort_options; +use datafusion_physical_expr_common::aggregate::AggregateExpr; use datafusion_physical_expr_common::expressions; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_expr_common::utils::reverse_order_bys; use std::any::Any; use std::fmt::Debug; +use std::sync::Arc; // TODO: macro udaf // make_udf_function!( @@ -339,6 +347,542 @@ impl Accumulator for FirstValueAccumulator { } } +/// TO BE DEPRECATED: Builtin FIRST_VALUE physical aggregate expression +#[derive(Debug, Clone)] +pub struct FirstValuePhysicalExpr { + name: String, + input_data_type: DataType, + order_by_data_types: Vec, + expr: Arc, + ordering_req: LexOrdering, + requirement_satisfied: bool, + ignore_nulls: bool, + state_fields: Vec, +} + +impl FirstValuePhysicalExpr { + /// Creates a new FIRST_VALUE aggregation function. + pub fn new( + expr: Arc, + name: impl Into, + input_data_type: DataType, + ordering_req: LexOrdering, + order_by_data_types: Vec, + state_fields: Vec, + ) -> Self { + let requirement_satisfied = ordering_req.is_empty(); + Self { + name: name.into(), + input_data_type, + order_by_data_types, + expr, + ordering_req, + requirement_satisfied, + ignore_nulls: false, + state_fields, + } + } + + pub fn with_ignore_nulls(mut self, ignore_nulls: bool) -> Self { + self.ignore_nulls = ignore_nulls; + self + } + + /// Returns the name of the aggregate expression. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the input data type of the aggregate expression. + pub fn input_data_type(&self) -> &DataType { + &self.input_data_type + } + + /// Returns the data types of the order-by columns. + pub fn order_by_data_types(&self) -> &Vec { + &self.order_by_data_types + } + + /// Returns the expression associated with the aggregate function. + pub fn expr(&self) -> &Arc { + &self.expr + } + + /// Returns the lexical ordering requirements of the aggregate expression. + pub fn ordering_req(&self) -> &LexOrdering { + &self.ordering_req + } + + pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { + self.requirement_satisfied = requirement_satisfied; + self + } + + pub fn convert_to_last(self) -> LastValuePhysicalExpr { + let name = if self.name.starts_with("FIRST") { + format!("LAST{}", &self.name[5..]) + } else { + format!("LAST_VALUE({})", self.expr) + }; + let FirstValuePhysicalExpr { + expr, + input_data_type, + ordering_req, + order_by_data_types, + .. 
+ } = self; + LastValuePhysicalExpr::new( + expr, + name, + input_data_type, + reverse_order_bys(&ordering_req), + order_by_data_types, + ) + } +} + +impl AggregateExpr for FirstValuePhysicalExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + Ok(Field::new(&self.name, self.input_data_type.clone(), true)) + } + + fn create_accumulator(&self) -> Result> { + FirstValueAccumulator::try_new( + &self.input_data_type, + &self.order_by_data_types, + self.ordering_req.clone(), + self.ignore_nulls, + ) + .map(|acc| { + Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ + }) + } + + fn state_fields(&self) -> Result> { + if !self.state_fields.is_empty() { + return Ok(self.state_fields.clone()); + } + + let mut fields = vec![Field::new( + format_state_name(&self.name, "first_value"), + self.input_data_type.clone(), + true, + )]; + fields.extend(ordering_fields( + &self.ordering_req, + &self.order_by_data_types, + )); + fields.push(Field::new( + format_state_name(&self.name, "is_set"), + DataType::Boolean, + true, + )); + Ok(fields) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { + (!self.ordering_req.is_empty()).then_some(&self.ordering_req) + } + + fn name(&self) -> &str { + &self.name + } + + fn reverse_expr(&self) -> Option> { + Some(Arc::new(self.clone().convert_to_last())) + } + + fn create_sliding_accumulator(&self) -> Result> { + FirstValueAccumulator::try_new( + &self.input_data_type, + &self.order_by_data_types, + self.ordering_req.clone(), + self.ignore_nulls, + ) + .map(|acc| { + Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ + }) + } +} + +impl PartialEq for FirstValuePhysicalExpr { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.name == x.name + && self.input_data_type == x.input_data_type + && self.order_by_data_types == x.order_by_data_types + && self.expr.eq(&x.expr) + }) + .unwrap_or(false) + } +} + +/// TO BE DEPRECATED: Builtin LAST_VALUE physical aggregate expression +#[derive(Debug, Clone)] +pub struct LastValuePhysicalExpr { + name: String, + input_data_type: DataType, + order_by_data_types: Vec, + expr: Arc, + ordering_req: LexOrdering, + requirement_satisfied: bool, + ignore_nulls: bool, +} + +impl LastValuePhysicalExpr { + /// Creates a new LAST_VALUE aggregation function. + pub fn new( + expr: Arc, + name: impl Into, + input_data_type: DataType, + ordering_req: LexOrdering, + order_by_data_types: Vec, + ) -> Self { + let requirement_satisfied = ordering_req.is_empty(); + Self { + name: name.into(), + input_data_type, + order_by_data_types, + expr, + ordering_req, + requirement_satisfied, + ignore_nulls: false, + } + } + + pub fn with_ignore_nulls(mut self, ignore_nulls: bool) -> Self { + self.ignore_nulls = ignore_nulls; + self + } + + /// Returns the name of the aggregate expression. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the input data type of the aggregate expression. + pub fn input_data_type(&self) -> &DataType { + &self.input_data_type + } + + /// Returns the data types of the order-by columns. + pub fn order_by_data_types(&self) -> &Vec { + &self.order_by_data_types + } + + /// Returns the expression associated with the aggregate function. + pub fn expr(&self) -> &Arc { + &self.expr + } + + /// Returns the lexical ordering requirements of the aggregate expression. 
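    // Note on `convert_to_first` further below: LAST_VALUE over an ordering
    // is equivalent to FIRST_VALUE over the reversed ordering, roughly
    //
    //     LAST_VALUE(x ORDER BY ts ASC) == FIRST_VALUE(x ORDER BY ts DESC)
    //
    // which is why the conversion passes `reverse_order_bys(&ordering_req)`.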
+ pub fn ordering_req(&self) -> &LexOrdering { + &self.ordering_req + } + + pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { + self.requirement_satisfied = requirement_satisfied; + self + } + + pub fn convert_to_first(self) -> FirstValuePhysicalExpr { + let name = if self.name.starts_with("LAST") { + format!("FIRST{}", &self.name[4..]) + } else { + format!("FIRST_VALUE({})", self.expr) + }; + let LastValuePhysicalExpr { + expr, + input_data_type, + ordering_req, + order_by_data_types, + .. + } = self; + FirstValuePhysicalExpr::new( + expr, + name, + input_data_type, + reverse_order_bys(&ordering_req), + order_by_data_types, + vec![], + ) + } +} + +impl AggregateExpr for LastValuePhysicalExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + Ok(Field::new(&self.name, self.input_data_type.clone(), true)) + } + + fn create_accumulator(&self) -> Result> { + LastValueAccumulator::try_new( + &self.input_data_type, + &self.order_by_data_types, + self.ordering_req.clone(), + self.ignore_nulls, + ) + .map(|acc| { + Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ + }) + } + + fn state_fields(&self) -> Result> { + let mut fields = vec![Field::new( + format_state_name(&self.name, "last_value"), + self.input_data_type.clone(), + true, + )]; + fields.extend(ordering_fields( + &self.ordering_req, + &self.order_by_data_types, + )); + fields.push(Field::new( + format_state_name(&self.name, "is_set"), + DataType::Boolean, + true, + )); + Ok(fields) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { + (!self.ordering_req.is_empty()).then_some(&self.ordering_req) + } + + fn name(&self) -> &str { + &self.name + } + + fn reverse_expr(&self) -> Option> { + Some(Arc::new(self.clone().convert_to_first())) + } + + fn create_sliding_accumulator(&self) -> Result> { + LastValueAccumulator::try_new( + &self.input_data_type, + &self.order_by_data_types, + self.ordering_req.clone(), + self.ignore_nulls, + ) + .map(|acc| { + Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ + }) + } +} + +impl PartialEq for LastValuePhysicalExpr { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| { + self.name == x.name + && self.input_data_type == x.input_data_type + && self.order_by_data_types == x.order_by_data_types + && self.expr.eq(&x.expr) + }) + .unwrap_or(false) + } +} + +#[derive(Debug)] +struct LastValueAccumulator { + last: ScalarValue, + // The `is_set` flag keeps track of whether the last value is finalized. + // This information is used to discriminate genuine NULLs and NULLS that + // occur due to empty partitions. + is_set: bool, + orderings: Vec, + // Stores the applicable ordering requirement. + ordering_req: LexOrdering, + // Stores whether incoming data already satisfies the ordering requirement. + requirement_satisfied: bool, + // Ignore null values. + ignore_nulls: bool, +} + +impl LastValueAccumulator { + /// Creates a new `LastValueAccumulator` for the given `data_type`. 
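    // Construction sketch matching the signature below (`ordering_req` is an
    // assumed, already-built `LexOrdering` with one entry):
    //
    //     let acc = LastValueAccumulator::try_new(
    //         &DataType::Int64,   // type of the tracked value
    //         &[DataType::Int64], // types of the ORDER BY columns
    //         ordering_req,       // the matching LexOrdering
    //         false,              // do not ignore nulls
    //     )?;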
+ pub fn try_new( + data_type: &DataType, + ordering_dtypes: &[DataType], + ordering_req: LexOrdering, + ignore_nulls: bool, + ) -> Result { + let orderings = ordering_dtypes + .iter() + .map(ScalarValue::try_from) + .collect::>>()?; + let requirement_satisfied = ordering_req.is_empty(); + ScalarValue::try_from(data_type).map(|last| Self { + last, + is_set: false, + orderings, + ordering_req, + requirement_satisfied, + ignore_nulls, + }) + } + + // Updates state with the values in the given row. + fn update_with_new_row(&mut self, row: &[ScalarValue]) { + self.last = row[0].clone(); + self.orderings = row[1..].to_vec(); + self.is_set = true; + } + + fn get_last_idx(&self, values: &[ArrayRef]) -> Result> { + let [value, ordering_values @ ..] = values else { + return internal_err!("Empty row in LAST_VALUE"); + }; + if self.requirement_satisfied { + // Get last entry according to the order of data: + if self.ignore_nulls { + // If ignoring nulls, find the last non-null value. + for i in (0..value.len()).rev() { + if !value.is_null(i) { + return Ok(Some(i)); + } + } + return Ok(None); + } else { + return Ok((!value.is_empty()).then_some(value.len() - 1)); + } + } + let sort_columns = ordering_values + .iter() + .zip(self.ordering_req.iter()) + .map(|(values, req)| { + // Take the reverse ordering requirement. This enables us to + // use "fetch = 1" to get the last value. + SortColumn { + values: values.clone(), + options: Some(!req.options), + } + }) + .collect::>(); + + if self.ignore_nulls { + let indices = lexsort_to_indices(&sort_columns, None)?; + // If ignoring nulls, find the last non-null value. + for index in indices.iter().flatten() { + if !value.is_null(index as usize) { + return Ok(Some(index as usize)); + } + } + Ok(None) + } else { + let indices = lexsort_to_indices(&sort_columns, Some(1))?; + Ok((!indices.is_empty()).then_some(indices.value(0) as _)) + } + } + + fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { + self.requirement_satisfied = requirement_satisfied; + self + } +} + +impl Accumulator for LastValueAccumulator { + fn state(&mut self) -> Result> { + let mut result = vec![self.last.clone()]; + result.extend(self.orderings.clone()); + result.push(ScalarValue::Boolean(Some(self.is_set))); + Ok(result) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + if !self.is_set || self.requirement_satisfied { + if let Some(last_idx) = self.get_last_idx(values)? { + let row = get_row_at_idx(values, last_idx)?; + self.update_with_new_row(&row); + } + } else if let Some(last_idx) = self.get_last_idx(values)? { + let row = get_row_at_idx(values, last_idx)?; + let orderings = &row[1..]; + // Update when there is a more recent entry + if compare_rows( + &self.orderings, + orderings, + &get_sort_options(&self.ordering_req), + )? + .is_lt() + { + self.update_with_new_row(&row); + } + } + + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + // LAST_VALUE(last1, last2, last3, ...) + // last index contains is_set flag. 
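        // For example, for LAST_VALUE(x ORDER BY ts) the state columns arrive
        // as [last_value, ts, is_set], so `is_set_idx` below evaluates to 2.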
+ let is_set_idx = states.len() - 1; + let flags = states[is_set_idx].as_boolean(); + let filtered_states = filter_states_according_to_is_set(states, flags)?; + // 1..is_set_idx range corresponds to ordering section + let sort_cols = + convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req); + + let ordered_states = if sort_cols.is_empty() { + // When no ordering is given, use existing state as is: + filtered_states + } else { + let indices = lexsort_to_indices(&sort_cols, None)?; + get_arrayref_at_indices(&filtered_states, &indices)? + }; + + if !ordered_states[0].is_empty() { + let last_idx = ordered_states[0].len() - 1; + let last_row = get_row_at_idx(&ordered_states, last_idx)?; + // When collecting orderings, we exclude the is_set flag from the state. + let last_ordering = &last_row[1..is_set_idx]; + let sort_options = get_sort_options(&self.ordering_req); + // Either there is no existing value, or there is a newer (latest) + // version in the new data: + if !self.is_set + || compare_rows(&self.orderings, last_ordering, &sort_options)?.is_lt() + { + // Update with last value in the state. Note that we should exclude the + // is_set flag from the state. Otherwise, we will end up with a state + // containing two is_set flags. + self.update_with_new_row(&last_row[0..is_set_idx]); + } + } + Ok(()) + } + + fn evaluate(&mut self) -> Result { + Ok(self.last.clone()) + } + + fn size(&self) -> usize { + std::mem::size_of_val(self) - std::mem::size_of_val(&self.last) + + self.last.size() + + ScalarValue::size_of_vec(&self.orderings) + - std::mem::size_of_val(&self.orderings) + } +} + /// Filters states according to the `is_set` flag at the last column and returns /// the resulting states. /// @@ -368,3 +912,114 @@ pub fn convert_to_sort_cols( }) .collect::>() } + +#[cfg(test)] +mod tests { + use arrow::array::Int64Array; + + use super::*; + + #[test] + fn test_first_last_value_value() -> Result<()> { + let mut first_accumulator = + FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + let mut last_accumulator = + LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + // first value in the tuple is start of the range (inclusive), + // second value in the tuple is end of the range (exclusive) + let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)]; + // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12 + let arrs = ranges + .into_iter() + .map(|(start, end)| { + Arc::new(Int64Array::from((start..end).collect::>())) as ArrayRef + }) + .collect::>(); + for arr in arrs { + // Once first_value is set, accumulator should remember it. + // It shouldn't update first_value for each new batch + first_accumulator.update_batch(&[arr.clone()])?; + // last_value should be updated for each new batch. 
+ last_accumulator.update_batch(&[arr])?; + } + // First Value comes from the first value of the first batch which is 0 + assert_eq!(first_accumulator.evaluate()?, ScalarValue::Int64(Some(0))); + // Last value comes from the last value of the last batch which is 12 + assert_eq!(last_accumulator.evaluate()?, ScalarValue::Int64(Some(12))); + Ok(()) + } + + #[test] + fn test_first_last_state_after_merge() -> Result<()> { + let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)]; + // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12 + let arrs = ranges + .into_iter() + .map(|(start, end)| { + Arc::new((start..end).collect::()) as ArrayRef + }) + .collect::>(); + + // FirstValueAccumulator + let mut first_accumulator = + FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + + first_accumulator.update_batch(&[arrs[0].clone()])?; + let state1 = first_accumulator.state()?; + + let mut first_accumulator = + FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + first_accumulator.update_batch(&[arrs[1].clone()])?; + let state2 = first_accumulator.state()?; + + assert_eq!(state1.len(), state2.len()); + + let mut states = vec![]; + + for idx in 0..state1.len() { + states.push(arrow::compute::concat(&[ + &state1[idx].to_array()?, + &state2[idx].to_array()?, + ])?); + } + + let mut first_accumulator = + FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + first_accumulator.merge_batch(&states)?; + + let merged_state = first_accumulator.state()?; + assert_eq!(merged_state.len(), state1.len()); + + // LastValueAccumulator + let mut last_accumulator = + LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + + last_accumulator.update_batch(&[arrs[0].clone()])?; + let state1 = last_accumulator.state()?; + + let mut last_accumulator = + LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + last_accumulator.update_batch(&[arrs[1].clone()])?; + let state2 = last_accumulator.state()?; + + assert_eq!(state1.len(), state2.len()); + + let mut states = vec![]; + + for idx in 0..state1.len() { + states.push(arrow::compute::concat(&[ + &state1[idx].to_array()?, + &state2[idx].to_array()?, + ])?); + } + + let mut last_accumulator = + LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; + last_accumulator.merge_batch(&states)?; + + let merged_state = last_accumulator.state()?; + assert_eq!(merged_state.len(), state1.len()); + + Ok(()) + } +} diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/aggregate-functions/src/macros.rs index 000d9d4200cc..af2a98edc473 100644 --- a/datafusion/aggregate-functions/src/macros.rs +++ b/datafusion/aggregate-functions/src/macros.rs @@ -15,36 +15,36 @@ // specific language governing permissions and limitations // under the License. -macro_rules! make_udf_function { - ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $AGGREGATE_UDF_FN:ident) => { - paste::paste! { - // "fluent expr_fn" style function - #[doc = $DOC] - pub fn $EXPR_FN($($arg: Expr),*) -> Expr { - Expr::ScalarFunction(ScalarFunction::new_udf( - $AGGREGATE_UDF_FN(), - vec![$($arg),*], - )) - } +// macro_rules! make_udf_function { +// ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $AGGREGATE_UDF_FN:ident) => { +// paste::paste! 
{ +// // "fluent expr_fn" style function +// #[doc = $DOC] +// pub fn $EXPR_FN($($arg: Expr),*) -> Expr { +// Expr::ScalarFunction(ScalarFunction::new_udf( +// $AGGREGATE_UDF_FN(), +// vec![$($arg),*], +// )) +// } - /// Singleton instance of [`$UDF`], ensures the UDF is only created once - /// named STATIC_$(UDF). For example `STATIC_ArrayToString` - #[allow(non_upper_case_globals)] - static [< STATIC_ $UDF >]: std::sync::OnceLock> = - std::sync::OnceLock::new(); +// /// Singleton instance of [`$UDF`], ensures the UDF is only created once +// /// named STATIC_$(UDF). For example `STATIC_ArrayToString` +// #[allow(non_upper_case_globals)] +// static [< STATIC_ $UDF >]: std::sync::OnceLock> = +// std::sync::OnceLock::new(); - /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`] - /// - /// [`ScalarUDF`]: datafusion_expr::ScalarUDF - pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc { - [< STATIC_ $UDF >] - .get_or_init(|| { - std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( - <$UDF>::new(), - )) - }) - .clone() - } - } - } -} +// /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`] +// /// +// /// [`ScalarUDF`]: datafusion_expr::ScalarUDF +// pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc { +// [< STATIC_ $UDF >] +// .get_or_init(|| { +// std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( +// <$UDF>::new(), +// )) +// }) +// .clone() +// } +// } +// } +// } diff --git a/datafusion/physical-expr/src/aggregate/first_last.rs b/datafusion/physical-expr/src/aggregate/first_last.rs deleted file mode 100644 index 0b0b9f398be1..000000000000 --- a/datafusion/physical-expr/src/aggregate/first_last.rs +++ /dev/null @@ -1,689 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines the FIRST_VALUE/LAST_VALUE aggregations. 
- -use std::any::Any; -use std::sync::Arc; - -use crate::aggregate::utils::{down_cast_any_ref, get_sort_options, ordering_fields}; -use crate::expressions::format_state_name; -use crate::{ - reverse_order_bys, AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr, -}; - -use arrow::array::{Array, ArrayRef, AsArray}; -use arrow::compute::{lexsort_to_indices, SortColumn}; -use arrow::datatypes::{DataType, Field}; -use datafusion_aggregate_functions::first_last::{ - convert_to_sort_cols, filter_states_according_to_is_set, FirstValueAccumulator, -}; -use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx}; -use datafusion_common::{internal_err, Result, ScalarValue}; -use datafusion_expr::Accumulator; - -/// FIRST_VALUE aggregate expression -#[derive(Debug, Clone)] -pub struct FirstValue { - name: String, - input_data_type: DataType, - order_by_data_types: Vec, - expr: Arc, - ordering_req: LexOrdering, - requirement_satisfied: bool, - ignore_nulls: bool, - state_fields: Vec, -} - -impl FirstValue { - /// Creates a new FIRST_VALUE aggregation function. - pub fn new( - expr: Arc, - name: impl Into, - input_data_type: DataType, - ordering_req: LexOrdering, - order_by_data_types: Vec, - state_fields: Vec, - ) -> Self { - let requirement_satisfied = ordering_req.is_empty(); - Self { - name: name.into(), - input_data_type, - order_by_data_types, - expr, - ordering_req, - requirement_satisfied, - ignore_nulls: false, - state_fields, - } - } - - pub fn with_ignore_nulls(mut self, ignore_nulls: bool) -> Self { - self.ignore_nulls = ignore_nulls; - self - } - - /// Returns the name of the aggregate expression. - pub fn name(&self) -> &str { - &self.name - } - - /// Returns the input data type of the aggregate expression. - pub fn input_data_type(&self) -> &DataType { - &self.input_data_type - } - - /// Returns the data types of the order-by columns. - pub fn order_by_data_types(&self) -> &Vec { - &self.order_by_data_types - } - - /// Returns the expression associated with the aggregate function. - pub fn expr(&self) -> &Arc { - &self.expr - } - - /// Returns the lexical ordering requirements of the aggregate expression. - pub fn ordering_req(&self) -> &LexOrdering { - &self.ordering_req - } - - pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { - self.requirement_satisfied = requirement_satisfied; - self - } - - pub fn convert_to_last(self) -> LastValue { - let name = if self.name.starts_with("FIRST") { - format!("LAST{}", &self.name[5..]) - } else { - format!("LAST_VALUE({})", self.expr) - }; - let FirstValue { - expr, - input_data_type, - ordering_req, - order_by_data_types, - .. 
- } = self; - LastValue::new( - expr, - name, - input_data_type, - reverse_order_bys(&ordering_req), - order_by_data_types, - ) - } -} - -impl AggregateExpr for FirstValue { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn field(&self) -> Result { - Ok(Field::new(&self.name, self.input_data_type.clone(), true)) - } - - fn create_accumulator(&self) -> Result> { - FirstValueAccumulator::try_new( - &self.input_data_type, - &self.order_by_data_types, - self.ordering_req.clone(), - self.ignore_nulls, - ) - .map(|acc| { - Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ - }) - } - - fn state_fields(&self) -> Result> { - if !self.state_fields.is_empty() { - return Ok(self.state_fields.clone()); - } - - let mut fields = vec![Field::new( - format_state_name(&self.name, "first_value"), - self.input_data_type.clone(), - true, - )]; - fields.extend(ordering_fields( - &self.ordering_req, - &self.order_by_data_types, - )); - fields.push(Field::new( - format_state_name(&self.name, "is_set"), - DataType::Boolean, - true, - )); - Ok(fields) - } - - fn expressions(&self) -> Vec> { - vec![self.expr.clone()] - } - - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - (!self.ordering_req.is_empty()).then_some(&self.ordering_req) - } - - fn name(&self) -> &str { - &self.name - } - - fn reverse_expr(&self) -> Option> { - Some(Arc::new(self.clone().convert_to_last())) - } - - fn create_sliding_accumulator(&self) -> Result> { - FirstValueAccumulator::try_new( - &self.input_data_type, - &self.order_by_data_types, - self.ordering_req.clone(), - self.ignore_nulls, - ) - .map(|acc| { - Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ - }) - } -} - -impl PartialEq for FirstValue { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.name == x.name - && self.input_data_type == x.input_data_type - && self.order_by_data_types == x.order_by_data_types - && self.expr.eq(&x.expr) - }) - .unwrap_or(false) - } -} -/// LAST_VALUE aggregate expression -#[derive(Debug, Clone)] -pub struct LastValue { - name: String, - input_data_type: DataType, - order_by_data_types: Vec, - expr: Arc, - ordering_req: LexOrdering, - requirement_satisfied: bool, - ignore_nulls: bool, -} - -impl LastValue { - /// Creates a new LAST_VALUE aggregation function. - pub fn new( - expr: Arc, - name: impl Into, - input_data_type: DataType, - ordering_req: LexOrdering, - order_by_data_types: Vec, - ) -> Self { - let requirement_satisfied = ordering_req.is_empty(); - Self { - name: name.into(), - input_data_type, - order_by_data_types, - expr, - ordering_req, - requirement_satisfied, - ignore_nulls: false, - } - } - - pub fn with_ignore_nulls(mut self, ignore_nulls: bool) -> Self { - self.ignore_nulls = ignore_nulls; - self - } - - /// Returns the name of the aggregate expression. - pub fn name(&self) -> &str { - &self.name - } - - /// Returns the input data type of the aggregate expression. - pub fn input_data_type(&self) -> &DataType { - &self.input_data_type - } - - /// Returns the data types of the order-by columns. - pub fn order_by_data_types(&self) -> &Vec { - &self.order_by_data_types - } - - /// Returns the expression associated with the aggregate function. - pub fn expr(&self) -> &Arc { - &self.expr - } - - /// Returns the lexical ordering requirements of the aggregate expression. 
- pub fn ordering_req(&self) -> &LexOrdering { - &self.ordering_req - } - - pub fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { - self.requirement_satisfied = requirement_satisfied; - self - } - - pub fn convert_to_first(self) -> FirstValue { - let name = if self.name.starts_with("LAST") { - format!("FIRST{}", &self.name[4..]) - } else { - format!("FIRST_VALUE({})", self.expr) - }; - let LastValue { - expr, - input_data_type, - ordering_req, - order_by_data_types, - .. - } = self; - FirstValue::new( - expr, - name, - input_data_type, - reverse_order_bys(&ordering_req), - order_by_data_types, - vec![], - ) - } -} - -impl AggregateExpr for LastValue { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn field(&self) -> Result { - Ok(Field::new(&self.name, self.input_data_type.clone(), true)) - } - - fn create_accumulator(&self) -> Result> { - LastValueAccumulator::try_new( - &self.input_data_type, - &self.order_by_data_types, - self.ordering_req.clone(), - self.ignore_nulls, - ) - .map(|acc| { - Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ - }) - } - - fn state_fields(&self) -> Result> { - let mut fields = vec![Field::new( - format_state_name(&self.name, "last_value"), - self.input_data_type.clone(), - true, - )]; - fields.extend(ordering_fields( - &self.ordering_req, - &self.order_by_data_types, - )); - fields.push(Field::new( - format_state_name(&self.name, "is_set"), - DataType::Boolean, - true, - )); - Ok(fields) - } - - fn expressions(&self) -> Vec> { - vec![self.expr.clone()] - } - - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - (!self.ordering_req.is_empty()).then_some(&self.ordering_req) - } - - fn name(&self) -> &str { - &self.name - } - - fn reverse_expr(&self) -> Option> { - Some(Arc::new(self.clone().convert_to_first())) - } - - fn create_sliding_accumulator(&self) -> Result> { - LastValueAccumulator::try_new( - &self.input_data_type, - &self.order_by_data_types, - self.ordering_req.clone(), - self.ignore_nulls, - ) - .map(|acc| { - Box::new(acc.with_requirement_satisfied(self.requirement_satisfied)) as _ - }) - } -} - -impl PartialEq for LastValue { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.name == x.name - && self.input_data_type == x.input_data_type - && self.order_by_data_types == x.order_by_data_types - && self.expr.eq(&x.expr) - }) - .unwrap_or(false) - } -} - -#[derive(Debug)] -struct LastValueAccumulator { - last: ScalarValue, - // The `is_set` flag keeps track of whether the last value is finalized. - // This information is used to discriminate genuine NULLs and NULLS that - // occur due to empty partitions. - is_set: bool, - orderings: Vec, - // Stores the applicable ordering requirement. - ordering_req: LexOrdering, - // Stores whether incoming data already satisfies the ordering requirement. - requirement_satisfied: bool, - // Ignore null values. - ignore_nulls: bool, -} - -impl LastValueAccumulator { - /// Creates a new `LastValueAccumulator` for the given `data_type`. 
- pub fn try_new( - data_type: &DataType, - ordering_dtypes: &[DataType], - ordering_req: LexOrdering, - ignore_nulls: bool, - ) -> Result { - let orderings = ordering_dtypes - .iter() - .map(ScalarValue::try_from) - .collect::>>()?; - let requirement_satisfied = ordering_req.is_empty(); - ScalarValue::try_from(data_type).map(|last| Self { - last, - is_set: false, - orderings, - ordering_req, - requirement_satisfied, - ignore_nulls, - }) - } - - // Updates state with the values in the given row. - fn update_with_new_row(&mut self, row: &[ScalarValue]) { - self.last = row[0].clone(); - self.orderings = row[1..].to_vec(); - self.is_set = true; - } - - fn get_last_idx(&self, values: &[ArrayRef]) -> Result> { - let [value, ordering_values @ ..] = values else { - return internal_err!("Empty row in LAST_VALUE"); - }; - if self.requirement_satisfied { - // Get last entry according to the order of data: - if self.ignore_nulls { - // If ignoring nulls, find the last non-null value. - for i in (0..value.len()).rev() { - if !value.is_null(i) { - return Ok(Some(i)); - } - } - return Ok(None); - } else { - return Ok((!value.is_empty()).then_some(value.len() - 1)); - } - } - let sort_columns = ordering_values - .iter() - .zip(self.ordering_req.iter()) - .map(|(values, req)| { - // Take the reverse ordering requirement. This enables us to - // use "fetch = 1" to get the last value. - SortColumn { - values: values.clone(), - options: Some(!req.options), - } - }) - .collect::>(); - - if self.ignore_nulls { - let indices = lexsort_to_indices(&sort_columns, None)?; - // If ignoring nulls, find the last non-null value. - for index in indices.iter().flatten() { - if !value.is_null(index as usize) { - return Ok(Some(index as usize)); - } - } - Ok(None) - } else { - let indices = lexsort_to_indices(&sort_columns, Some(1))?; - Ok((!indices.is_empty()).then_some(indices.value(0) as _)) - } - } - - fn with_requirement_satisfied(mut self, requirement_satisfied: bool) -> Self { - self.requirement_satisfied = requirement_satisfied; - self - } -} - -impl Accumulator for LastValueAccumulator { - fn state(&mut self) -> Result> { - let mut result = vec![self.last.clone()]; - result.extend(self.orderings.clone()); - result.push(ScalarValue::Boolean(Some(self.is_set))); - Ok(result) - } - - fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - if !self.is_set || self.requirement_satisfied { - if let Some(last_idx) = self.get_last_idx(values)? { - let row = get_row_at_idx(values, last_idx)?; - self.update_with_new_row(&row); - } - } else if let Some(last_idx) = self.get_last_idx(values)? { - let row = get_row_at_idx(values, last_idx)?; - let orderings = &row[1..]; - // Update when there is a more recent entry - if compare_rows( - &self.orderings, - orderings, - &get_sort_options(&self.ordering_req), - )? - .is_lt() - { - self.update_with_new_row(&row); - } - } - - Ok(()) - } - - fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - // LAST_VALUE(last1, last2, last3, ...) - // last index contains is_set flag. 
- let is_set_idx = states.len() - 1; - let flags = states[is_set_idx].as_boolean(); - let filtered_states = filter_states_according_to_is_set(states, flags)?; - // 1..is_set_idx range corresponds to ordering section - let sort_cols = - convert_to_sort_cols(&filtered_states[1..is_set_idx], &self.ordering_req); - - let ordered_states = if sort_cols.is_empty() { - // When no ordering is given, use existing state as is: - filtered_states - } else { - let indices = lexsort_to_indices(&sort_cols, None)?; - get_arrayref_at_indices(&filtered_states, &indices)? - }; - - if !ordered_states[0].is_empty() { - let last_idx = ordered_states[0].len() - 1; - let last_row = get_row_at_idx(&ordered_states, last_idx)?; - // When collecting orderings, we exclude the is_set flag from the state. - let last_ordering = &last_row[1..is_set_idx]; - let sort_options = get_sort_options(&self.ordering_req); - // Either there is no existing value, or there is a newer (latest) - // version in the new data: - if !self.is_set - || compare_rows(&self.orderings, last_ordering, &sort_options)?.is_lt() - { - // Update with last value in the state. Note that we should exclude the - // is_set flag from the state. Otherwise, we will end up with a state - // containing two is_set flags. - self.update_with_new_row(&last_row[0..is_set_idx]); - } - } - Ok(()) - } - - fn evaluate(&mut self) -> Result { - Ok(self.last.clone()) - } - - fn size(&self) -> usize { - std::mem::size_of_val(self) - std::mem::size_of_val(&self.last) - + self.last.size() - + ScalarValue::size_of_vec(&self.orderings) - - std::mem::size_of_val(&self.orderings) - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use crate::aggregate::first_last::{FirstValueAccumulator, LastValueAccumulator}; - - use arrow::compute::concat; - use arrow_array::{ArrayRef, Int64Array}; - use arrow_schema::DataType; - use datafusion_common::{Result, ScalarValue}; - use datafusion_expr::Accumulator; - - #[test] - fn test_first_last_value_value() -> Result<()> { - let mut first_accumulator = - FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - let mut last_accumulator = - LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - // first value in the tuple is start of the range (inclusive), - // second value in the tuple is end of the range (exclusive) - let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)]; - // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12 - let arrs = ranges - .into_iter() - .map(|(start, end)| { - Arc::new(Int64Array::from((start..end).collect::>())) as ArrayRef - }) - .collect::>(); - for arr in arrs { - // Once first_value is set, accumulator should remember it. - // It shouldn't update first_value for each new batch - first_accumulator.update_batch(&[arr.clone()])?; - // last_value should be updated for each new batch. 
- last_accumulator.update_batch(&[arr])?; - } - // First Value comes from the first value of the first batch which is 0 - assert_eq!(first_accumulator.evaluate()?, ScalarValue::Int64(Some(0))); - // Last value comes from the last value of the last batch which is 12 - assert_eq!(last_accumulator.evaluate()?, ScalarValue::Int64(Some(12))); - Ok(()) - } - - #[test] - fn test_first_last_state_after_merge() -> Result<()> { - let ranges: Vec<(i64, i64)> = vec![(0, 10), (1, 11), (2, 13)]; - // create 3 ArrayRefs between each interval e.g from 0 to 9, 1 to 10, 2 to 12 - let arrs = ranges - .into_iter() - .map(|(start, end)| { - Arc::new((start..end).collect::()) as ArrayRef - }) - .collect::>(); - - // FirstValueAccumulator - let mut first_accumulator = - FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - - first_accumulator.update_batch(&[arrs[0].clone()])?; - let state1 = first_accumulator.state()?; - - let mut first_accumulator = - FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - first_accumulator.update_batch(&[arrs[1].clone()])?; - let state2 = first_accumulator.state()?; - - assert_eq!(state1.len(), state2.len()); - - let mut states = vec![]; - - for idx in 0..state1.len() { - states.push(concat(&[ - &state1[idx].to_array()?, - &state2[idx].to_array()?, - ])?); - } - - let mut first_accumulator = - FirstValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - first_accumulator.merge_batch(&states)?; - - let merged_state = first_accumulator.state()?; - assert_eq!(merged_state.len(), state1.len()); - - // LastValueAccumulator - let mut last_accumulator = - LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - - last_accumulator.update_batch(&[arrs[0].clone()])?; - let state1 = last_accumulator.state()?; - - let mut last_accumulator = - LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - last_accumulator.update_batch(&[arrs[1].clone()])?; - let state2 = last_accumulator.state()?; - - assert_eq!(state1.len(), state2.len()); - - let mut states = vec![]; - - for idx in 0..state1.len() { - states.push(concat(&[ - &state1[idx].to_array()?, - &state2[idx].to_array()?, - ])?); - } - - let mut last_accumulator = - LastValueAccumulator::try_new(&DataType::Int64, &[], vec![], false)?; - last_accumulator.merge_batch(&states)?; - - let merged_state = last_accumulator.state()?; - assert_eq!(merged_state.len(), state1.len()); - - Ok(()) - } -} diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index e176084ae6ec..eff008e8f825 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -38,7 +38,6 @@ pub(crate) mod correlation; pub(crate) mod count; pub(crate) mod count_distinct; pub(crate) mod covariance; -pub(crate) mod first_last; pub(crate) mod grouping; pub(crate) mod median; pub(crate) mod nth_value; diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index f0cc4b175ea5..651a205e4143 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -53,7 +53,6 @@ pub use crate::aggregate::correlation::Correlation; pub use crate::aggregate::count::Count; pub use crate::aggregate::count_distinct::DistinctCount; pub use crate::aggregate::covariance::{Covariance, CovariancePop}; -pub use crate::aggregate::first_last::{FirstValue, LastValue}; pub use crate::aggregate::grouping::Grouping; pub use 
crate::aggregate::median::Median; pub use crate::aggregate::min_max::{Max, Min}; @@ -76,6 +75,9 @@ pub use crate::window::rank::{dense_rank, percent_rank, rank}; pub use crate::window::rank::{Rank, RankType}; pub use crate::window::row_number::RowNumber; pub use crate::PhysicalSortExpr; +pub use datafusion_aggregate_functions::first_last::{ + FirstValuePhysicalExpr as FirstValue, LastValuePhysicalExpr as LastValue, +}; pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index f8ad03bf6d97..98c44e23c6c7 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1235,7 +1235,7 @@ mod tests { use datafusion_execution::memory_pool::FairSpillPool; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_physical_expr::expressions::{ - lit, ApproxDistinct, Count, FirstValue, LastValue, Median, OrderSensitiveArrayAgg, + lit, ApproxDistinct, Count, LastValue, Median, OrderSensitiveArrayAgg, }; use datafusion_physical_expr::{ reverse_order_bys, AggregateExpr, EquivalenceProperties, PhysicalExpr, From 109b790eeadbc45828b00076da8764f853b7bc61 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:18:14 +0800 Subject: [PATCH 20/38] use macro Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/Cargo.toml | 1 + .../aggregate-functions/src/first_last.rs | 15 ++--- datafusion/aggregate-functions/src/lib.rs | 19 +++--- datafusion/aggregate-functions/src/macros.rs | 67 ++++++++++--------- .../tests/cases/roundtrip_logical_plan.rs | 1 + 5 files changed, 54 insertions(+), 49 deletions(-) diff --git a/datafusion/aggregate-functions/Cargo.toml b/datafusion/aggregate-functions/Cargo.toml index cd20b37ad922..a71094aed288 100644 --- a/datafusion/aggregate-functions/Cargo.toml +++ b/datafusion/aggregate-functions/Cargo.toml @@ -41,3 +41,4 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } log = { workspace = true } +paste = "1.0.14" diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index 5b2c2ff62e61..7aafb46caf96 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -43,14 +43,13 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -// TODO: macro udaf -// make_udf_function!( -// FirstValue, -// first_value, -// value: Expr, -// "Returns the first value in a group of values.", -// first_value_fn -// ); +make_udaf_function!(FirstValue, + first_value, + value, + "Returns the first value in a group of values.", + first_value_udaf, + create_first_value_accumulator +); pub struct FirstValue { signature: Signature, diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs index 15d17e0805d8..def52a40e6b1 100644 --- a/datafusion/aggregate-functions/src/lib.rs +++ b/datafusion/aggregate-functions/src/lib.rs @@ -79,28 +79,27 @@ //! 
[`ScalarUDF`]: datafusion_expr::ScalarUDF use std::sync::Arc; +#[macro_use] +pub mod macros; + use datafusion_common::Result; use datafusion_execution::FunctionRegistry; use datafusion_expr::AggregateUDF; -use first_last::{create_first_value_accumulator, FirstValue}; use log::debug; pub mod first_last; pub mod utils; -#[macro_use] -pub mod macros; - /// Fluent-style API for creating `Expr`s -pub mod expr_fn {} +pub mod expr_fn { + pub use super::first_last::first_value; +} /// Registers all enabled packages with a [`FunctionRegistry`] pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { - // TODO: macro this creation - let accumulator = Arc::new(create_first_value_accumulator); - let first_value = AggregateUDF::from(FirstValue::new(accumulator)); - - let functions: Vec> = vec![first_value.into()]; + let functions: Vec> = vec![ + first_last::first_value_udaf() + ]; functions.into_iter().try_for_each(|udf| { let existing_udaf = registry.register_udaf(udf)?; diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/aggregate-functions/src/macros.rs index af2a98edc473..cdeeb45b7d03 100644 --- a/datafusion/aggregate-functions/src/macros.rs +++ b/datafusion/aggregate-functions/src/macros.rs @@ -15,36 +15,41 @@ // specific language governing permissions and limitations // under the License. -// macro_rules! make_udf_function { -// ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , $AGGREGATE_UDF_FN:ident) => { -// paste::paste! { -// // "fluent expr_fn" style function -// #[doc = $DOC] -// pub fn $EXPR_FN($($arg: Expr),*) -> Expr { -// Expr::ScalarFunction(ScalarFunction::new_udf( -// $AGGREGATE_UDF_FN(), -// vec![$($arg),*], -// )) -// } +macro_rules! make_udaf_function { + ($UDAF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $AGGREGATE_UDF_FN:ident, $ACCUMULATOR:ident) => { + paste::paste! { + // "fluent expr_fn" style function + #[doc = $DOC] + pub fn $EXPR_FN($($arg: Expr),*) -> Expr { + Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new_udf( + $AGGREGATE_UDF_FN(), + vec![$($arg),*], + false, + None, + None, + None, + )) + } -// /// Singleton instance of [`$UDF`], ensures the UDF is only created once -// /// named STATIC_$(UDF). For example `STATIC_ArrayToString` -// #[allow(non_upper_case_globals)] -// static [< STATIC_ $UDF >]: std::sync::OnceLock> = -// std::sync::OnceLock::new(); + /// Singleton instance of [$UDAF], ensures the UDF is only created once + /// named STATIC_$(UDAF). 
For example `STATIC_FirstValue` + #[allow(non_upper_case_globals)] + static [< STATIC_ $UDAF >]: std::sync::OnceLock> = + std::sync::OnceLock::new(); -// /// ScalarFunction that returns a [`ScalarUDF`] for [`$UDF`] -// /// -// /// [`ScalarUDF`]: datafusion_expr::ScalarUDF -// pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc { -// [< STATIC_ $UDF >] -// .get_or_init(|| { -// std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( -// <$UDF>::new(), -// )) -// }) -// .clone() -// } -// } -// } -// } + /// Aggregatefunction that returns a [AggregateUDF] for [$UDAF] + /// + /// [AggregateUDF]: datafusion_expr::AggregateUDF + pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc { + [< STATIC_ $UDAF >] + .get_or_init(|| { + + let accumulator = std::sync::Arc::new($ACCUMULATOR); + std::sync::Arc::new(datafusion_expr::AggregateUDF::from(<$UDAF>::new(accumulator))) + + }) + .clone() + } + } + } +} diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index f136e314559b..e72f6d02e1c7 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -612,6 +612,7 @@ async fn roundtrip_expr_api() -> Result<()> { lit(1), ), array_replace_all(make_array(vec![lit(1), lit(2), lit(3)]), lit(2), lit(4)), + // TODO: Add first value after built-in functions are deprecated ]; // ensure expressions created with the expr api can be round tripped From 87d589fd44db218a01335ae07a5db4e1c890bf3c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:18:28 +0800 Subject: [PATCH 21/38] fmt Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/src/first_last.rs | 3 ++- datafusion/aggregate-functions/src/lib.rs | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index 7aafb46caf96..11d526e411d1 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -43,7 +43,8 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -make_udaf_function!(FirstValue, +make_udaf_function!( + FirstValue, first_value, value, "Returns the first value in a group of values.", diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs index def52a40e6b1..df880ca1bacc 100644 --- a/datafusion/aggregate-functions/src/lib.rs +++ b/datafusion/aggregate-functions/src/lib.rs @@ -97,9 +97,7 @@ pub mod expr_fn { /// Registers all enabled packages with a [`FunctionRegistry`] pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { - let functions: Vec> = vec![ - first_last::first_value_udaf() - ]; + let functions: Vec> = vec![first_last::first_value_udaf()]; functions.into_iter().try_for_each(|udf| { let existing_udaf = registry.register_udaf(udf)?; From 398e4e2cb1d3e426d2320e04b1de177253c43622 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:46:19 +0800 Subject: [PATCH 22/38] fix doc Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/src/lib.rs | 32 +++-------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs index df880ca1bacc..be2c7a991fa2 100644 --- a/datafusion/aggregate-functions/src/lib.rs +++ b/datafusion/aggregate-functions/src/lib.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under 
the License.
 
-//! Function packages for [DataFusion].
+//! Aggregate Function packages for [DataFusion].
 //!
-//! This crate contains a collection of various function packages for DataFusion,
+//! This crate contains a collection of various aggregate function packages for DataFusion,
 //! implemented using the extension API. Users may wish to control which functions
 //! are available to control the binary size of their application as well as
 //! use dialect specific implementations of functions (e.g. Spark vs Postgres)
@@ -33,33 +33,9 @@
 //! # Using A Package
 //! You can register all functions in all packages using the [`register_all`] function.
 //!
-//! To access and use only the functions in a certain package, use the
-//! `functions()` method in each module.
-//!
-//! ```
-//! # fn main() -> datafusion_common::Result<()> {
-//! # let mut registry = datafusion_execution::registry::MemoryFunctionRegistry::new();
-//! # use datafusion_execution::FunctionRegistry;
-//! // get the encoding functions
-//! use datafusion_functions::encoding;
-//! for udf in encoding::functions() {
-//!     registry.register_udf(udf)?;
-//! }
-//! # Ok(())
-//! # }
-//! ```
-//!
 //! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke
-//! functions using a fluent style. For example:
-//!
-//! ```
-//! // create an Expr that will invoke the encode function
-//! use datafusion_expr::{col, lit};
-//! use datafusion_functions::expr_fn;
-//! // Equivalent to "encode(my_data, 'hex')" in SQL:
-//! let expr = expr_fn::encode(col("my_data"), lit("hex"));
-//! ```
-//!
+//! functions using a fluent style.
+//!
 //![`Expr`]: datafusion_expr::Expr
 //!
 //! # Implementing A New Package
@@ -67,7 +43,7 @@
 //! To add a new package to this crate, you should follow the model of existing
 //! packages. The high level steps are:
 //!
-//! 1. Create a new module with the appropriate [`ScalarUDF`] implementations.
+//! 1. Create a new module with the appropriate [AggregateUDF] implementations.
 //!
 //! 2. Use the macros in [`macros`] to create standard entry points.
 //!
@@ -75,8 +51,6 @@
 //!
 //! 4. Use the `make_package!` macro to expose the module when the
 //!    feature is enabled.
-//!
-//! [`ScalarUDF`]: datafusion_expr::ScalarUDF
 use std::sync::Arc;
 
 #[macro_use]
 pub mod macros;

From 04c7f5e134a6a04a483c95309bb9e2c774187b47 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Fri, 5 Apr 2024 21:48:01 +0800
Subject: [PATCH 23/38] add todo

Signed-off-by: jayzhan211
---
 datafusion/aggregate-functions/src/macros.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/aggregate-functions/src/macros.rs
index cdeeb45b7d03..a44a9e21ee05 100644
--- a/datafusion/aggregate-functions/src/macros.rs
+++ b/datafusion/aggregate-functions/src/macros.rs
@@ -24,6 +24,7 @@ macro_rules!
make_udaf_function { Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new_udf( $AGGREGATE_UDF_FN(), vec![$($arg),*], + // TODO: Support arguments for `expr` API false, None, None, From 01a1ddf9ed8b906ceab79c725503124b3bb2d339 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:49:24 +0800 Subject: [PATCH 24/38] rm comments Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/src/first_last.rs | 8 -------- datafusion/aggregate-functions/src/lib.rs | 1 - datafusion/aggregate-functions/src/utils.rs | 16 ---------------- 3 files changed, 25 deletions(-) delete mode 100644 datafusion/aggregate-functions/src/utils.rs diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index 11d526e411d1..82254be5c7c3 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -120,14 +120,6 @@ impl AggregateUDFImpl for FirstValue { } } -// /// Creates a new UDAF with a specific signature, state type and return type. -// /// The signature and state type must match the `Accumulator's implementation`. -// /// TOOD: We plan to move aggregate function to its own crate. This function will be deprecated then. -// pub fn create_first_value() -> AggregateUDF { -// let accumulator = Arc::new(create_first_value_accumulator); -// AggregateUDF::from(FirstValue::new(accumulator)) -// } - pub(crate) fn create_first_value_accumulator( acc_args: AccumulatorArgs, ) -> Result> { diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/aggregate-functions/src/lib.rs index be2c7a991fa2..cab85ebcfb79 100644 --- a/datafusion/aggregate-functions/src/lib.rs +++ b/datafusion/aggregate-functions/src/lib.rs @@ -62,7 +62,6 @@ use datafusion_expr::AggregateUDF; use log::debug; pub mod first_last; -pub mod utils; /// Fluent-style API for creating `Expr`s pub mod expr_fn { diff --git a/datafusion/aggregate-functions/src/utils.rs b/datafusion/aggregate-functions/src/utils.rs deleted file mode 100644 index b248758bc120..000000000000 --- a/datafusion/aggregate-functions/src/utils.rs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
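A quick illustration of what the series has built so far: the macro-generated fluent API. This is an editorial sketch, not part of any patch; `price` is a made-up column name, and only the pieces introduced above (`expr_fn::first_value` and its `Expr::AggregateFunction` expansion) are assumed.

```rust
use datafusion_aggregate_functions::expr_fn::first_value;
use datafusion_expr::{col, Expr};

// Builds `FIRST_VALUE(price)` as a logical expression. The macro-generated
// `first_value` wraps its argument via AggregateFunction::new_udf, with
// distinct/filter/order_by left unset (see the TODO added in patch 23).
fn first_price() -> Expr {
    first_value(col("price"))
}
```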
From 4871414ec8b6c9b0a6ce1a0ed5246c0bd4b5e5c5 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:51:44 +0800 Subject: [PATCH 25/38] rm unused Signed-off-by: jayzhan211 --- datafusion/expr/src/expr_fn.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index cb454b4e6cad..611b498ead34 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -708,17 +708,6 @@ pub fn create_udaf( )) } -// /// Creates a new UDAF with a specific signature, state type and return type. -// /// The signature and state type must match the `Accumulator's implementation`. -// /// TOOD: We plan to move aggregate function to its own crate. This function will be deprecated then. -// pub fn create_first_value( -// name: &str, -// signature: Signature, -// accumulator: AccumulatorFactoryFunction, -// ) -> AggregateUDF { -// AggregateUDF::from(FirstValue::new(name, signature, accumulator)) -// } - /// Implements [`AggregateUDFImpl`] for functions that have a single signature and /// return type. pub struct SimpleAggregateUDF { From 1ef212bd662c822328ebd919f04dc0bda1af45b4 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:54:36 +0800 Subject: [PATCH 26/38] rm unused code Signed-off-by: jayzhan211 --- datafusion/expr/src/expr_fn.rs | 73 ---------------------------------- 1 file changed, 73 deletions(-) diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 611b498ead34..f68685a87f13 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -24,7 +24,6 @@ use crate::expr::{ use crate::function::{ AccumulatorArgs, AccumulatorFactoryFunction, PartitionEvaluatorFactory, }; -use crate::utils::format_state_name; use crate::{ aggregate_function, built_in_function, conditional_expressions::CaseBuilder, logical_plan::Subquery, AggregateUDF, BuiltinScalarFunction, Expr, LogicalPlan, @@ -802,78 +801,6 @@ impl AggregateUDFImpl for SimpleAggregateUDF { } } -pub struct FirstValue { - name: String, - signature: Signature, - accumulator: AccumulatorFactoryFunction, -} - -impl Debug for FirstValue { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct("FirstValue") - .field("name", &self.name) - .field("signature", &self.signature) - .field("accumulator", &"") - .finish() - } -} - -impl FirstValue { - pub fn new( - name: impl Into, - signature: Signature, - accumulator: AccumulatorFactoryFunction, - ) -> Self { - let name = name.into(); - Self { - name, - signature, - accumulator, - } - } -} - -impl AggregateUDFImpl for FirstValue { - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - &self.name - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn return_type(&self, arg_types: &[DataType]) -> Result { - Ok(arg_types[0].clone()) - } - - fn accumulator( - &self, - acc_args: AccumulatorArgs, - ) -> Result> { - (self.accumulator)(acc_args) - } - - fn state_fields( - &self, - name: &str, - value_type: DataType, - ordering_fields: Vec, - ) -> Result> { - let mut fields = vec![Field::new( - format_state_name(name, "first_value"), - value_type, - true, - )]; - fields.extend(ordering_fields); - fields.push(Field::new("is_set", DataType::Boolean, true)); - Ok(fields) - } -} - /// Creates a new UDWF with a specific signature, state type and return type. /// /// The signature and state type must match the [`PartitionEvaluator`]'s implementation`. 
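Patch 26 below deletes the stopgap `FirstValue` implementation from `expr_fn.rs` now that the canonical one lives in the new crate. For reference, the trait surface such an implementation has to cover is small; this is a hedged sketch (the `MyFirst` struct and `my_first` name are made up, and the accumulator is deliberately left unimplemented):

```rust
use std::any::Any;

use arrow::datatypes::{DataType, Field};
use datafusion_common::{not_impl_err, Result};
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility};

#[derive(Debug)]
struct MyFirst {
    signature: Signature,
}

impl MyFirst {
    fn new() -> Self {
        Self {
            // Accept exactly one argument of any type.
            signature: Signature::any(1, Volatility::Immutable),
        }
    }
}

impl AggregateUDFImpl for MyFirst {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "my_first"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        // Like FIRST_VALUE, the result type matches the input type.
        Ok(arg_types[0].clone())
    }

    fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
        // Accumulator construction is elided; see first_last.rs for a real one.
        not_impl_err!("my_first accumulator")
    }

    fn state_fields(
        &self,
        name: &str,
        value_type: DataType,
        _ordering_fields: Vec<Field>,
    ) -> Result<Vec<Field>> {
        // One intermediate state field holding the current first value.
        Ok(vec![Field::new(format!("{name}[my_first]"), value_type, true)])
    }
}
```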
From b6d53a5227bcebf25de4e4c7dc28e3e91994abdb Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 21:58:50 +0800 Subject: [PATCH 27/38] change to private Signed-off-by: jayzhan211 --- datafusion/aggregate-functions/src/first_last.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/aggregate-functions/src/first_last.rs index 82254be5c7c3..e1aaff8ac93d 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/aggregate-functions/src/first_last.rs @@ -877,9 +877,7 @@ impl Accumulator for LastValueAccumulator { /// Filters states according to the `is_set` flag at the last column and returns /// the resulting states. -/// -/// TODO: This function can be private once the `LAST_VALUE` function is moved to the `aggregate-functions` crate. -pub fn filter_states_according_to_is_set( +fn filter_states_according_to_is_set( states: &[ArrayRef], flags: &BooleanArray, ) -> Result> { @@ -890,9 +888,7 @@ pub fn filter_states_according_to_is_set( } /// Combines array refs and their corresponding orderings to construct `SortColumn`s. -/// -/// TODO: This function can be private once the `LAST_VALUE` function is moved to the `aggregate-functions` crate. -pub fn convert_to_sort_cols( +fn convert_to_sort_cols( arrs: &[ArrayRef], sort_exprs: &[PhysicalSortExpr], ) -> Vec { From 9aa15a208323a1171d7ef8827d98ed1390c19e9d Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Fri, 5 Apr 2024 22:12:22 +0800 Subject: [PATCH 28/38] fix lock Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 6f5f035ed300..e393f23c9929 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1176,6 +1176,7 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr-common", "log", + "paste", ] [[package]] From e90464bdbd4afdbcdc16e37d921cd0a13a2ae6ce Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Apr 2024 07:42:20 +0800 Subject: [PATCH 29/38] cleanup Signed-off-by: jayzhan211 --- Cargo.toml | 4 ++-- datafusion/core/Cargo.toml | 2 +- datafusion/core/src/execution/context/mod.rs | 7 ++++--- datafusion/core/src/lib.rs | 5 +++++ .../Cargo.toml | 4 ++-- .../src/first_last.rs | 4 ++-- .../src/lib.rs | 0 .../src/macros.rs | 4 ++-- datafusion/physical-expr/Cargo.toml | 2 +- datafusion/physical-expr/src/expressions/mod.rs | 2 +- 10 files changed, 20 insertions(+), 14 deletions(-) rename datafusion/{aggregate-functions => functions-aggregate}/Cargo.toml (95%) rename datafusion/{aggregate-functions => functions-aggregate}/src/first_last.rs (99%) rename datafusion/{aggregate-functions => functions-aggregate}/src/lib.rs (100%) rename datafusion/{aggregate-functions => functions-aggregate}/src/macros.rs (93%) diff --git a/Cargo.toml b/Cargo.toml index 1864afbef81b..3af3db6f6626 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ members = [ "datafusion/core", "datafusion/expr", "datafusion/execution", - "datafusion/aggregate-functions", + "datafusion/functions-aggregate", "datafusion/functions", "datafusion/functions-array", "datafusion/optimizer", @@ -74,12 +74,12 @@ chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" datafusion = { path = "datafusion/core", version = "37.0.0", default-features = false } -datafusion-aggregate-functions = { path = "datafusion/aggregate-functions", version = "37.0.0" } datafusion-common = { path = "datafusion/common", 
version = "37.0.0", default-features = false } datafusion-common-runtime = { path = "datafusion/common-runtime", version = "37.0.0" } datafusion-execution = { path = "datafusion/execution", version = "37.0.0" } datafusion-expr = { path = "datafusion/expr", version = "37.0.0" } datafusion-functions = { path = "datafusion/functions", version = "37.0.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "37.0.0" } datafusion-functions-array = { path = "datafusion/functions-array", version = "37.0.0" } datafusion-optimizer = { path = "datafusion/optimizer", version = "37.0.0", default-features = false } datafusion-physical-expr = { path = "datafusion/physical-expr", version = "37.0.0", default-features = false } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 2e45fb8f4905..26ad4a15ee14 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -93,7 +93,7 @@ bytes = { workspace = true } bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = { workspace = true } -datafusion-aggregate-functions = { workspace = true } +datafusion-functions-aggregate = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 79eb3caacc77..7f4c47d2fe56 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -24,6 +24,7 @@ use std::string::String; use std::sync::{Arc, Weak}; use super::options::ReadOptions; +use crate::{functions, functions_aggregate, functions_array}; use crate::{ catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}, catalog::listing_schema::ListingSchemaProvider, @@ -1448,15 +1449,15 @@ impl SessionState { }; // register built in functions - datafusion_functions::register_all(&mut new_self) + functions::register_all(&mut new_self) .expect("can not register built in functions"); // register crate of array expressions (if enabled) #[cfg(feature = "array_expressions")] - datafusion_functions_array::register_all(&mut new_self) + functions_array::register_all(&mut new_self) .expect("can not register array expressions"); - datafusion_aggregate_functions::register_all(&mut new_self) + functions_aggregate::register_all(&mut new_self) .expect("can not register aggregate functions"); new_self diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index f6e2171d6b5f..93eafb8d776e 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -538,6 +538,11 @@ pub mod functions_array { pub use datafusion_functions_array::*; } +/// re-export of [`datafusion_functions_aggregate`] crate +pub mod functions_aggregate { + pub use datafusion_functions_aggregate::*; +} + #[cfg(test)] pub mod test; pub mod test_util; diff --git a/datafusion/aggregate-functions/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml similarity index 95% rename from datafusion/aggregate-functions/Cargo.toml rename to datafusion/functions-aggregate/Cargo.toml index a71094aed288..d42932d8abdd 100644 --- a/datafusion/aggregate-functions/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -16,7 +16,7 @@ # under the License. 
[package] -name = "datafusion-aggregate-functions" +name = "datafusion-functions-aggregate" description = "Aggregate function packages for the DataFusion query engine" keywords = ["datafusion", "logical", "plan", "expressions"] readme = "README.md" @@ -29,7 +29,7 @@ authors = { workspace = true } rust-version = { workspace = true } [lib] -name = "datafusion_aggregate_functions" +name = "datafusion_functions_aggregate" path = "src/lib.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/datafusion/aggregate-functions/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs similarity index 99% rename from datafusion/aggregate-functions/src/first_last.rs rename to datafusion/functions-aggregate/src/first_last.rs index e1aaff8ac93d..b369c49ecd23 100644 --- a/datafusion/aggregate-functions/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -339,7 +339,7 @@ impl Accumulator for FirstValueAccumulator { } } -/// TO BE DEPRECATED: Builtin FIRST_VALUE physical aggregate expression +/// TO BE DEPRECATED: Builtin FIRST_VALUE physical aggregate expression will be replaced by udf in the future #[derive(Debug, Clone)] pub struct FirstValuePhysicalExpr { name: String, @@ -520,7 +520,7 @@ impl PartialEq for FirstValuePhysicalExpr { } } -/// TO BE DEPRECATED: Builtin LAST_VALUE physical aggregate expression +/// TO BE DEPRECATED: Builtin LAST_VALUE physical aggregate expression will be replaced by udf in the future #[derive(Debug, Clone)] pub struct LastValuePhysicalExpr { name: String, diff --git a/datafusion/aggregate-functions/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs similarity index 100% rename from datafusion/aggregate-functions/src/lib.rs rename to datafusion/functions-aggregate/src/lib.rs diff --git a/datafusion/aggregate-functions/src/macros.rs b/datafusion/functions-aggregate/src/macros.rs similarity index 93% rename from datafusion/aggregate-functions/src/macros.rs rename to datafusion/functions-aggregate/src/macros.rs index a44a9e21ee05..7e0a84dd3c29 100644 --- a/datafusion/aggregate-functions/src/macros.rs +++ b/datafusion/functions-aggregate/src/macros.rs @@ -32,13 +32,13 @@ macro_rules! make_udaf_function { )) } - /// Singleton instance of [$UDAF], ensures the UDF is only created once + /// Singleton instance of [$UDAF], ensures the UDAF is only created once /// named STATIC_$(UDAF). 
For example `STATIC_FirstValue` #[allow(non_upper_case_globals)] static [< STATIC_ $UDAF >]: std::sync::OnceLock> = std::sync::OnceLock::new(); - /// Aggregatefunction that returns a [AggregateUDF] for [$UDAF] + /// AggregateFunction that returns a [AggregateUDF] for [$UDAF] /// /// [AggregateUDF]: datafusion_expr::AggregateUDF pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc { diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 352e7e44a9cd..40159e00014a 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -56,7 +56,7 @@ base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { workspace = true } -datafusion-aggregate-functions = { workspace = true } +datafusion-functions-aggregate = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 651a205e4143..d0da792aff18 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -75,7 +75,7 @@ pub use crate::window::rank::{dense_rank, percent_rank, rank}; pub use crate::window::rank::{Rank, RankType}; pub use crate::window::row_number::RowNumber; pub use crate::PhysicalSortExpr; -pub use datafusion_aggregate_functions::first_last::{ +pub use datafusion_functions_aggregate::first_last::{ FirstValuePhysicalExpr as FirstValue, LastValuePhysicalExpr as LastValue, }; From ece925f5ac6e469289f671e328365e3293ed2c30 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Apr 2024 07:52:41 +0800 Subject: [PATCH 30/38] cleanup Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 30 +++--- datafusion/core/Cargo.toml | 2 +- datafusion/core/src/execution/context/mod.rs | 2 +- .../functions-aggregate/src/first_last.rs | 102 +++++++++--------- datafusion/functions-aggregate/src/macros.rs | 7 +- datafusion/physical-expr/Cargo.toml | 2 +- 6 files changed, 68 insertions(+), 77 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index e393f23c9929..350c26445616 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1130,12 +1130,12 @@ dependencies = [ "bzip2", "chrono", "dashmap", - "datafusion-aggregate-functions", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-aggregate", "datafusion-functions-array", "datafusion-optimizer", "datafusion-physical-expr", @@ -1166,19 +1166,6 @@ dependencies = [ "zstd 0.13.0", ] -[[package]] -name = "datafusion-aggregate-functions" -version = "37.0.0" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr-common", - "log", - "paste", -] - [[package]] name = "datafusion-cli" version = "37.0.0" @@ -1293,6 +1280,19 @@ dependencies = [ "uuid", ] +[[package]] +name = "datafusion-functions-aggregate" +version = "37.0.0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + [[package]] name = "datafusion-functions-array" version = "37.0.0" @@ -1342,10 +1342,10 @@ dependencies = [ "blake2", "blake3", "chrono", - "datafusion-aggregate-functions", "datafusion-common", 
"datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate", "datafusion-physical-expr-common", "half", "hashbrown 0.14.3", diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 26ad4a15ee14..4f18cb5cb74d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -93,12 +93,12 @@ bytes = { workspace = true } bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = { workspace = true } -datafusion-functions-aggregate = { workspace = true } datafusion-common = { workspace = true, features = ["object_store"] } datafusion-common-runtime = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-functions = { workspace = true } +datafusion-functions-aggregate = { workspace = true } datafusion-functions-array = { workspace = true, optional = true } datafusion-optimizer = { workspace = true } datafusion-physical-expr = { workspace = true } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 7f4c47d2fe56..f56f11fb8856 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -24,7 +24,6 @@ use std::string::String; use std::sync::{Arc, Weak}; use super::options::ReadOptions; -use crate::{functions, functions_aggregate, functions_array}; use crate::{ catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}, catalog::listing_schema::ListingSchemaProvider, @@ -58,6 +57,7 @@ use crate::{ physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, variable::{VarProvider, VarType}, }; +use crate::{functions, functions_aggregate, functions_array}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::record_batch::RecordBatch; diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index b369c49ecd23..d5367ad34163 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -27,10 +27,7 @@ use datafusion_common::{ use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{ - Accumulator, AccumulatorFactoryFunction, AggregateUDFImpl, Expr, Signature, - Volatility, -}; +use datafusion_expr::{Accumulator, AggregateUDFImpl, Expr, Signature, Volatility}; use datafusion_physical_expr_common::aggregate::utils::{ down_cast_any_ref, get_sort_options, ordering_fields, }; @@ -48,14 +45,12 @@ make_udaf_function!( first_value, value, "Returns the first value in a group of values.", - first_value_udaf, - create_first_value_accumulator + first_value_udaf ); pub struct FirstValue { signature: Signature, aliases: Vec, - accumulator: AccumulatorFactoryFunction, } impl Debug for FirstValue { @@ -68,12 +63,17 @@ impl Debug for FirstValue { } } +impl Default for FirstValue { + fn default() -> Self { + Self::new() + } +} + impl FirstValue { - pub fn new(accumulator: AccumulatorFactoryFunction) -> Self { + pub fn new() -> Self { Self { aliases: vec![String::from("FIRST_VALUE")], signature: Signature::uniform(1, NUMERICS.to_vec(), Volatility::Immutable), - accumulator, } } } @@ -96,7 +96,45 @@ impl AggregateUDFImpl for FirstValue { } fn accumulator(&self, acc_args: AccumulatorArgs) -> Result> { - (self.accumulator)(acc_args) + let mut all_sort_orders = vec![]; + + // Construct PhysicalSortExpr objects from Expr objects: + let 
mut sort_exprs = vec![];
+        for expr in acc_args.sort_exprs {
+            if let Expr::Sort(sort) = expr {
+                if let Expr::Column(col) = sort.expr.as_ref() {
+                    let name = &col.name;
+                    let e = expressions::column::col(name, acc_args.schema)?;
+                    sort_exprs.push(PhysicalSortExpr {
+                        expr: e,
+                        options: SortOptions {
+                            descending: !sort.asc,
+                            nulls_first: sort.nulls_first,
+                        },
+                    });
+                }
+            }
+        }
+        if !sort_exprs.is_empty() {
+            all_sort_orders.extend(sort_exprs);
+        }
+
+        let ordering_req = all_sort_orders;
+
+        let ordering_dtypes = ordering_req
+            .iter()
+            .map(|e| e.expr.data_type(acc_args.schema))
+            .collect::<Result<Vec<_>>>()?;
+
+        let requirement_satisfied = ordering_req.is_empty();
+
+        FirstValueAccumulator::try_new(
+            acc_args.data_type,
+            &ordering_dtypes,
+            ordering_req,
+            acc_args.ignore_nulls,
+        )
+        .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
     }
 
     fn state_fields(
@@ -120,50 +158,6 @@ impl AggregateUDFImpl for FirstValue {
     }
 }
 
-pub(crate) fn create_first_value_accumulator(
-    acc_args: AccumulatorArgs,
-) -> Result<Box<dyn Accumulator>> {
-    let mut all_sort_orders = vec![];
-
-    // Construct PhysicalSortExpr objects from Expr objects:
-    let mut sort_exprs = vec![];
-    for expr in acc_args.sort_exprs {
-        if let Expr::Sort(sort) = expr {
-            if let Expr::Column(col) = sort.expr.as_ref() {
-                let name = &col.name;
-                let e = expressions::column::col(name, acc_args.schema)?;
-                sort_exprs.push(PhysicalSortExpr {
-                    expr: e,
-                    options: SortOptions {
-                        descending: !sort.asc,
-                        nulls_first: sort.nulls_first,
-                    },
-                });
-            }
-        }
-    }
-    if !sort_exprs.is_empty() {
-        all_sort_orders.extend(sort_exprs);
-    }
-
-    let ordering_req = all_sort_orders;
-
-    let ordering_dtypes = ordering_req
-        .iter()
-        .map(|e| e.expr.data_type(acc_args.schema))
-        .collect::<Result<Vec<_>>>()?;
-
-    let requirement_satisfied = ordering_req.is_empty();
-
-    FirstValueAccumulator::try_new(
-        acc_args.data_type,
-        &ordering_dtypes,
-        ordering_req,
-        acc_args.ignore_nulls,
-    )
-    .map(|acc| Box::new(acc.with_requirement_satisfied(requirement_satisfied)) as _)
-}
-
 #[derive(Debug)]
 pub struct FirstValueAccumulator {
     first: ScalarValue,
diff --git a/datafusion/functions-aggregate/src/macros.rs b/datafusion/functions-aggregate/src/macros.rs
index 7e0a84dd3c29..d24c60f93270 100644
--- a/datafusion/functions-aggregate/src/macros.rs
+++ b/datafusion/functions-aggregate/src/macros.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 macro_rules! make_udaf_function {
-    ($UDAF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $AGGREGATE_UDF_FN:ident, $ACCUMULATOR:ident) => {
+    ($UDAF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $AGGREGATE_UDF_FN:ident) => {
         paste::paste! {
             // "fluent expr_fn" style function
             #[doc = $DOC]
@@ -44,10 +44,7 @@ macro_rules! make_udaf_function {
             pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<datafusion_expr::AggregateUDF> {
                 [< STATIC_ $UDAF >]
                     .get_or_init(|| {
-
-                        let accumulator = std::sync::Arc::new($ACCUMULATOR);
-                        std::sync::Arc::new(datafusion_expr::AggregateUDF::from(<$UDAF>::new(accumulator)))
-
+                        std::sync::Arc::new(datafusion_expr::AggregateUDF::from(<$UDAF>::default()))
                     })
                     .clone()
             }
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 40159e00014a..72fac5370ae0 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -56,10 +56,10 @@ base64 = { version = "0.22", optional = true }
 blake2 = { version = "^0.10.2", optional = true }
 blake3 = { version = "1.0", optional = true }
 chrono = { workspace = true }
-datafusion-functions-aggregate = { workspace = true }
 datafusion-common = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
+datafusion-functions-aggregate = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
 half = { workspace = true }
 hashbrown = { version = "0.14", features = ["raw"] }

From 89ccc89dce9a43f3258d43ba827bfd4c72ffdd57 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sat, 6 Apr 2024 07:57:51 +0800
Subject: [PATCH 31/38] support roundtrip

Signed-off-by: jayzhan211
---
 datafusion/proto/tests/cases/roundtrip_logical_plan.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index e72f6d02e1c7..e680a1b2ff1e 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -30,6 +30,7 @@ use datafusion::datasource::provider::TableProviderFactory;
 use datafusion::datasource::TableProvider;
 use datafusion::execution::context::SessionState;
 use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv};
+use datafusion::functions_aggregate::expr_fn::first_value;
 use datafusion::prelude::*;
 use datafusion::test_util::{TestTableFactory, TestTableProvider};
 use datafusion_common::config::{FormatOptions, TableOptions};
@@ -612,7 +613,7 @@ async fn roundtrip_expr_api() -> Result<()> {
             lit(1),
         ),
         array_replace_all(make_array(vec![lit(1), lit(2), lit(3)]), lit(2), lit(4)),
-        // TODO: Add first value after built-in functions are deprecated
+        first_value(lit(1)),
     ];
 
     // ensure expressions created with the expr api can be round tripped

From 41a830a366b3e3efed5c432f8f691e4b51548ce6 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sat, 6 Apr 2024 08:01:05 +0800
Subject: [PATCH 32/38] remove old format state

Signed-off-by: jayzhan211
---
 datafusion/expr/src/utils.rs                    | 3 +--
 datafusion/physical-expr/src/expressions/mod.rs | 6 +-----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 8add56dd349f..a93282574e8a 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -1240,8 +1240,7 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema {
     }
 }
 
-/// Construct state name. State is the intermidiate state of the aggregate function.
-/// TODO: Remove duplicated function in physical-expr
+/// Build state name. State is the intermediate state of the aggregate function.
pub fn format_state_name(name: &str, state_name: &str) -> String { format!("{name}[{state_name}]") } diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index d0da792aff18..688d5ce6eabf 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -83,6 +83,7 @@ pub use binary::{binary, BinaryExpr}; pub use case::{case, CaseExpr}; pub use cast::{cast, cast_with_options, CastExpr}; pub use column::UnKnownColumn; +pub use datafusion_expr::utils::format_state_name; pub use datafusion_physical_expr_common::expressions::column::{col, Column}; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; @@ -94,11 +95,6 @@ pub use no_op::NoOp; pub use not::{not, NotExpr}; pub use try_cast::{try_cast, TryCastExpr}; -/// returns the name of the state -pub fn format_state_name(name: &str, state_name: &str) -> String { - format!("{name}[{state_name}]") -} - #[cfg(test)] pub(crate) mod tests { use std::sync::Arc; From d235d2a970cf5591fe92bfca23b7f9b38b84c29c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Apr 2024 14:37:23 +0800 Subject: [PATCH 33/38] move aggregate related things to aggr crate Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 1 + datafusion/core/src/execution/context/mod.rs | 3 +- .../functions-aggregate/src/first_last.rs | 6 +- datafusion/functions-aggregate/src/lib.rs | 277 +++++++++++++++++- .../src}/utils.rs | 5 +- .../physical-expr-common/src/aggregate/mod.rs | 102 ------- datafusion/physical-expr-common/src/lib.rs | 1 - datafusion/physical-expr/src/aggregate/mod.rs | 2 +- .../physical-expr/src/aggregate/utils.rs | 6 +- datafusion/physical-expr/src/lib.rs | 2 +- datafusion/physical-plan/Cargo.toml | 1 + datafusion/physical-plan/src/lib.rs | 5 +- datafusion/physical-plan/src/udaf.rs | 218 -------------- 13 files changed, 289 insertions(+), 340 deletions(-) rename datafusion/{physical-expr-common/src/aggregate => functions-aggregate/src}/utils.rs (96%) delete mode 100644 datafusion/physical-expr-common/src/aggregate/mod.rs delete mode 100644 datafusion/physical-plan/src/udaf.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 350c26445616..b2a7c425f579 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1385,6 +1385,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate", "datafusion-physical-expr", "futures", "half", diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index f56f11fb8856..8fc60770105b 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -44,6 +44,7 @@ use crate::{ datasource::{provider_as_source, MemTable, TableProvider, ViewTable}, error::{DataFusionError, Result}, execution::{options::ArrowReadOptions, runtime_env::RuntimeEnv, FunctionRegistry}, + logical_expr::AggregateUDF, logical_expr::{ CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, CreateMemoryTable, CreateView, DropCatalogSchema, DropFunction, DropTable, @@ -53,7 +54,7 @@ use crate::{ optimizer::analyzer::{Analyzer, AnalyzerRule}, optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule}, physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule}, - physical_plan::{udaf::AggregateUDF, udf::ScalarUDF, ExecutionPlan}, + physical_plan::{udf::ScalarUDF, ExecutionPlan}, 
physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, variable::{VarProvider, VarType}, }; diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index d5367ad34163..988520adb9c3 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -17,6 +17,8 @@ //! Defines the FIRST_VALUE/LAST_VALUE aggregations. +use crate::utils::{down_cast_any_ref, get_sort_options, ordering_fields}; +use crate::AggregateExpr; use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::compute::{self, lexsort_to_indices, SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field}; @@ -28,10 +30,6 @@ use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{Accumulator, AggregateUDFImpl, Expr, Signature, Volatility}; -use datafusion_physical_expr_common::aggregate::utils::{ - down_cast_any_ref, get_sort_options, ordering_fields, -}; -use datafusion_physical_expr_common::aggregate::AggregateExpr; use datafusion_physical_expr_common::expressions; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index cab85ebcfb79..cea556fcec7a 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -51,17 +51,27 @@ //! //! 4. Use the `make_package!` macro to expose the module when the //! feature is enabled. -use std::sync::Arc; #[macro_use] pub mod macros; -use datafusion_common::Result; +pub mod first_last; +pub mod utils; + +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion_common::{not_impl_err, Result}; use datafusion_execution::FunctionRegistry; -use datafusion_expr::AggregateUDF; +use datafusion_expr::{ + function::AccumulatorArgs, Accumulator, AggregateUDF, Expr, GroupsAccumulator, +}; +use datafusion_physical_expr_common::{ + physical_expr::PhysicalExpr, + sort_expr::{LexOrdering, PhysicalSortExpr}, +}; use log::debug; - -pub mod first_last; +use std::fmt::Debug; +use std::{any::Any, sync::Arc}; +use utils::{down_cast_any_ref, ordering_fields}; /// Fluent-style API for creating `Expr`s pub mod expr_fn { @@ -82,3 +92,260 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { Ok(()) } + +/// Creates a physical expression of the UDAF, that includes all necessary type coercion. +/// This function errors when `args`' can't be coerced to a valid argument type of the UDAF. 
+pub fn create_aggregate_expr(
+    fun: &AggregateUDF,
+    input_phy_exprs: &[Arc<dyn PhysicalExpr>],
+    sort_exprs: &[Expr],
+    ordering_req: &[PhysicalSortExpr],
+    schema: &Schema,
+    name: impl Into<String>,
+    ignore_nulls: bool,
+) -> Result<Arc<dyn AggregateExpr>> {
+    let input_exprs_types = input_phy_exprs
+        .iter()
+        .map(|arg| arg.data_type(schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    let ordering_types = ordering_req
+        .iter()
+        .map(|e| e.expr.data_type(schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    let ordering_fields = ordering_fields(ordering_req, &ordering_types);
+
+    Ok(Arc::new(AggregateFunctionExpr {
+        fun: fun.clone(),
+        args: input_phy_exprs.to_vec(),
+        data_type: fun.return_type(&input_exprs_types)?,
+        name: name.into(),
+        schema: schema.clone(),
+        sort_exprs: sort_exprs.to_vec(),
+        ordering_req: ordering_req.to_vec(),
+        ignore_nulls,
+        ordering_fields,
+    }))
+}
+
+/// An aggregate expression that:
+/// * knows its resulting field
+/// * knows how to create its accumulator
+/// * knows its accumulator's state's field
+/// * knows the expressions from which its accumulator will receive values
+///
+/// Any implementation of this trait also needs to implement the
+/// `PartialEq<dyn Any>` to allow comparing equality between the
+/// trait objects.
+pub trait AggregateExpr: Send + Sync + Debug + PartialEq<dyn Any> {
+    /// Returns the aggregate expression as [`Any`] so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// the field of the final result of this aggregation.
+    fn field(&self) -> Result<Field>;
+
+    /// the accumulator used to accumulate values from the expressions.
+    /// the accumulator expects the same number of arguments as `expressions` and must
+    /// return states with the same description as `state_fields`
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>>;
+
+    /// the fields that encapsulate the Accumulator's state
+    /// the number of fields here equals the number of states that the accumulator contains
+    fn state_fields(&self) -> Result<Vec<Field>>;
+
+    /// expressions that are passed to the Accumulator.
+    /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many.
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>>;
+
+    /// Order by requirements for the aggregate function
+    /// By default it is `None` (there is no requirement)
+    /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)`, should implement this
+    fn order_bys(&self) -> Option<&[PhysicalSortExpr]> {
+        None
+    }
+
+    /// Human readable name such as `"MIN(c2)"`. The default
+    /// implementation returns placeholder text.
+    fn name(&self) -> &str {
+        "AggregateExpr: default name"
+    }
+
+    /// Whether the aggregate expression has a specialized
+    /// [`GroupsAccumulator`] implementation. If this returns true,
+    /// [`Self::create_groups_accumulator`] will be called.
+    fn groups_accumulator_supported(&self) -> bool {
+        false
+    }
+
+    /// Return a specialized [`GroupsAccumulator`] that manages state
+    /// for all groups.
+    ///
+    /// For maximum performance, a [`GroupsAccumulator`] should be
+    /// implemented in addition to [`Accumulator`].
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet")
+    }
+
+    /// Construct an expression that calculates the aggregate in reverse.
+    /// Typically the "reverse" expression is itself (e.g. SUM, COUNT).
+    /// For aggregates that do not support calculation in reverse,
+    /// returns None (which is the default value).
+    fn reverse_expr(&self) -> Option<Arc<dyn AggregateExpr>> {
+        None
+    }
+
+    /// Creates accumulator implementation that supports retract
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet")
+    }
+}
+
+/// Physical aggregate expression of a UDAF.
+#[derive(Debug)]
+pub struct AggregateFunctionExpr {
+    fun: AggregateUDF,
+    args: Vec<Arc<dyn PhysicalExpr>>,
+    /// Output / return type of this aggregate
+    data_type: DataType,
+    name: String,
+    schema: Schema,
+    // The logical order by expressions
+    sort_exprs: Vec<Expr>,
+    // The physical order by expressions
+    ordering_req: LexOrdering,
+    ignore_nulls: bool,
+    ordering_fields: Vec<Field>,
+}
+
+impl AggregateFunctionExpr {
+    /// Return the `AggregateUDF` used by this `AggregateFunctionExpr`
+    pub fn fun(&self) -> &AggregateUDF {
+        &self.fun
+    }
+}
+
+impl AggregateExpr for AggregateFunctionExpr {
+    /// Return a reference to Any that can be used for downcasting
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
+        self.args.clone()
+    }
+
+    fn state_fields(&self) -> Result<Vec<Field>> {
+        self.fun.state_fields(
+            self.name(),
+            self.data_type.clone(),
+            self.ordering_fields.clone(),
+        )
+    }
+
+    fn field(&self) -> Result<Field> {
+        Ok(Field::new(&self.name, self.data_type.clone(), true))
+    }
+
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        let acc_args = AccumulatorArgs::new(
+            &self.data_type,
+            &self.schema,
+            self.ignore_nulls,
+            &self.sort_exprs,
+        );
+
+        self.fun.accumulator(acc_args)
+    }
+
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        let accumulator = self.create_accumulator()?;
+
+        // Accumulators whose window frame start differs from
+        // `UNBOUNDED PRECEDING`, such as `1 PRECEDING`, need to
+        // implement the `retract_batch` method in order to run
+        // correctly in DataFusion.
+        //
+        // If `retract_batch` is not present, there is no way
+        // to calculate the result correctly. For example, the query
+        //
+        // ```sql
+        // SELECT
+        //  SUM(a) OVER(ORDER BY a ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS sum_a
+        // FROM
+        //  t
+        // ```
+        //
+        // 1. First sum value will be the sum of rows between `[0, 1)`,
+        //
+        // 2. Second sum value will be the sum of rows between `[0, 2)`
+        //
+        // 3. Third sum value will be the sum of rows between `[1, 3)`, etc.
+        //
+        // Since the accumulator keeps the running sum:
+        //
+        // 1. First sum we add to the state sum value between `[0, 1)`
+        //
+        // 2. Second sum we add to the state sum value between `[1, 2)`
+        // (`[0, 1)` is already in the state sum, hence running sum will
+        // cover `[0, 2)` range)
+        //
+        // 3. Third sum we add to the state sum value between `[2, 3)`
+        // (`[0, 2)` is already in the state sum). Also we need to
+        // retract values between `[0, 1)`; this way we can obtain the sum
+        // between `[1, 3)`, which is indeed the appropriate range.
+        //
+        // When we use `UNBOUNDED PRECEDING` in the query, the starting
+        // index will always be 0 for the desired range, and hence the
+        // `retract_batch` method will not be called. In this case
+        // having `retract_batch` is not a requirement.
+        //
+        // This approach is a bit different from the window function
+        // approach. Window functions (when they use a window frame)
+        // get all the desired range during evaluation.
+        if !accumulator.supports_retract_batch() {
+            return not_impl_err!(
+                "Aggregate can not be used as a sliding accumulator because \
+                 `retract_batch` is not implemented: {}",
+                self.name
+            );
+        }
+        Ok(accumulator)
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn groups_accumulator_supported(&self) -> bool {
+        self.fun.groups_accumulator_supported()
+    }
+
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        self.fun.create_groups_accumulator()
+    }
+
+    fn order_bys(&self) -> Option<&[PhysicalSortExpr]> {
+        (!self.ordering_req.is_empty()).then_some(&self.ordering_req)
+    }
+}
+
+impl PartialEq<dyn Any> for AggregateFunctionExpr {
+    fn eq(&self, other: &dyn Any) -> bool {
+        down_cast_any_ref(other)
+            .downcast_ref::<AggregateFunctionExpr>()
+            .map(|x| {
+                self.name == x.name
+                    && self.data_type == x.data_type
+                    && self.fun == x.fun
+                    && self.args.len() == x.args.len()
+                    && self
+                        .args
+                        .iter()
+                        .zip(x.args.iter())
+                        .all(|(this_arg, other_arg)| this_arg.eq(other_arg))
+            })
+            .unwrap_or(false)
+    }
+}
diff --git a/datafusion/physical-expr-common/src/aggregate/utils.rs b/datafusion/functions-aggregate/src/utils.rs
similarity index 96%
rename from datafusion/physical-expr-common/src/aggregate/utils.rs
rename to datafusion/functions-aggregate/src/utils.rs
index 9821ba626b18..989db83cf4f6 100644
--- a/datafusion/physical-expr-common/src/aggregate/utils.rs
+++ b/datafusion/functions-aggregate/src/utils.rs
@@ -21,10 +21,9 @@ use arrow::{
     compute::SortOptions,
     datatypes::{DataType, Field},
 };
+use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
 
-use crate::sort_expr::PhysicalSortExpr;
-
-use super::AggregateExpr;
+use crate::AggregateExpr;
 
 /// Downcast a `Box<dyn AggregateExpr>` or `Arc<dyn AggregateExpr>`
 /// and return the inner trait object as [`Any`] so
diff --git a/datafusion/physical-expr-common/src/aggregate/mod.rs b/datafusion/physical-expr-common/src/aggregate/mod.rs
deleted file mode 100644
index 579f51815d84..000000000000
--- a/datafusion/physical-expr-common/src/aggregate/mod.rs
+++ /dev/null
@@ -1,102 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-pub mod utils;
-
-use std::any::Any;
-use std::fmt::Debug;
-use std::sync::Arc;
-
-use crate::physical_expr::PhysicalExpr;
-use crate::sort_expr::PhysicalSortExpr;
-
-use arrow::datatypes::Field;
-use datafusion_common::{not_impl_err, Result};
-use datafusion_expr::{Accumulator, GroupsAccumulator};
-
-/// An aggregate expression that:
-/// * knows its resulting field
-/// * knows how to create its accumulator
-/// * knows its accumulator's state's field
-/// * knows the expressions from whose its accumulator will receive values
-///
-/// Any implementation of this trait also needs to implement the
-/// `PartialEq<dyn Any>` to allows comparing equality between the
-/// trait objects.
-pub trait AggregateExpr: Send + Sync + Debug + PartialEq { - /// Returns the aggregate expression as [`Any`] so that it can be - /// downcast to a specific implementation. - fn as_any(&self) -> &dyn Any; - - /// the field of the final result of this aggregation. - fn field(&self) -> Result; - - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; - - /// the fields that encapsulate the Accumulator's state - /// the number of fields here equals the number of states that the accumulator contains - fn state_fields(&self) -> Result>; - - /// expressions that are passed to the Accumulator. - /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. - fn expressions(&self) -> Vec>; - - /// Order by requirements for the aggregate function - /// By default it is `None` (there is no requirement) - /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)` should implement this - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - /// Human readable name such as `"MIN(c2)"`. The default - /// implementation returns placeholder text. - fn name(&self) -> &str { - "AggregateExpr: default name" - } - - /// If the aggregate expression has a specialized - /// [`GroupsAccumulator`] implementation. If this returns true, - /// `[Self::create_groups_accumulator`] will be called. - fn groups_accumulator_supported(&self) -> bool { - false - } - - /// Return a specialized [`GroupsAccumulator`] that manages state - /// for all groups. - /// - /// For maximum performance, a [`GroupsAccumulator`] should be - /// implemented in addition to [`Accumulator`]. - fn create_groups_accumulator(&self) -> Result> { - not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet") - } - - /// Construct an expression that calculates the aggregate in reverse. - /// Typically the "reverse" expression is itself (e.g. SUM, COUNT). - /// For aggregates that do not support calculation in reverse, - /// returns None (which is the default value). - fn reverse_expr(&self) -> Option> { - None - } - - /// Creates accumulator implementation that supports retract - fn create_sliding_accumulator(&self) -> Result> { - not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet") - } -} diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index 53e3134a1b05..3fe11a246b32 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
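With this move, both the `AggregateExpr` trait and `create_aggregate_expr` live in `datafusion-functions-aggregate`. A hedged sketch of driving the relocated entry point directly (the schema, column name `a`, and display name are stand-ins; only the signatures shown in this patch are assumed):

```rust
use std::sync::Arc;

use arrow::datatypes::Schema;
use datafusion_common::Result;
use datafusion_functions_aggregate::first_last::first_value_udaf;
use datafusion_functions_aggregate::{create_aggregate_expr, AggregateExpr};
use datafusion_physical_expr_common::expressions::column::col;

fn first_value_physical(schema: &Schema) -> Result<Arc<dyn AggregateExpr>> {
    // Physical column expression for input column "a".
    let input = col("a", schema)?;
    create_aggregate_expr(
        &first_value_udaf(), // Arc<AggregateUDF> from the macro; derefs to &AggregateUDF
        &[input],            // physical input expressions
        &[],                 // no logical ORDER BY expressions
        &[],                 // no physical ordering requirement
        schema,
        "FIRST_VALUE(a)",    // display name
        false,               // ignore_nulls
    )
}
```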
-pub mod aggregate; pub mod expressions; pub mod physical_expr; pub mod sort_expr; diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index eff008e8f825..40eeadfa4199 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use crate::expressions::{NthValueAgg, OrderSensitiveArrayAgg}; -pub use datafusion_physical_expr_common::aggregate::AggregateExpr; +pub use datafusion_functions_aggregate::AggregateExpr; mod hyperloglog; mod tdigest; diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs index d14a52f5752d..51b1363183f3 100644 --- a/datafusion/physical-expr/src/aggregate/utils.rs +++ b/datafusion/physical-expr/src/aggregate/utils.rs @@ -20,9 +20,9 @@ use std::sync::Arc; // For backwards compatibility -pub use datafusion_physical_expr_common::aggregate::utils::down_cast_any_ref; -pub use datafusion_physical_expr_common::aggregate::utils::get_sort_options; -pub use datafusion_physical_expr_common::aggregate::utils::ordering_fields; +pub use datafusion_functions_aggregate::utils::down_cast_any_ref; +pub use datafusion_functions_aggregate::utils::get_sort_options; +pub use datafusion_functions_aggregate::utils::ordering_fields; use arrow::array::{ArrayRef, ArrowNativeTypeOp}; use arrow_array::cast::AsArray; diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index 7b81e8f8a5c4..65882c23b694 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -41,7 +41,7 @@ pub mod execution_props { pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState}; pub use analysis::{analyze, AnalysisContext, ExprBoundaries}; -pub use datafusion_physical_expr_common::aggregate::AggregateExpr; +pub use datafusion_functions_aggregate::AggregateExpr; pub use equivalence::EquivalenceProperties; pub use partitioning::{Distribution, Partitioning}; pub use physical_expr::{ diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 1ba32bff746e..b76c557066d8 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -46,6 +46,7 @@ datafusion-common = { workspace = true, default-features = true } datafusion-common-runtime = { workspace = true, default-features = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-functions-aggregate = { workspace = true } datafusion-physical-expr = { workspace = true, default-features = true } futures = { workspace = true } half = { workspace = true } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 3decf2e34015..59166365b91d 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -66,7 +66,6 @@ pub mod sorts; pub mod stream; pub mod streaming; pub mod tree_node; -pub mod udaf; pub mod union; pub mod unnest; pub mod values; @@ -91,6 +90,10 @@ pub use datafusion_physical_expr::{ // Backwards compatibility pub use crate::stream::EmptyRecordBatchStream; pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; +pub mod udaf { + pub use datafusion_functions_aggregate::create_aggregate_expr; + pub use datafusion_functions_aggregate::AggregateFunctionExpr; +} /// Represent nodes in the DataFusion Physical Plan. 
/// diff --git a/datafusion/physical-plan/src/udaf.rs b/datafusion/physical-plan/src/udaf.rs deleted file mode 100644 index 74a5603c0c81..000000000000 --- a/datafusion/physical-plan/src/udaf.rs +++ /dev/null @@ -1,218 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains functions and structs supporting user-defined aggregate functions. - -use datafusion_expr::function::AccumulatorArgs; -use datafusion_expr::{Expr, GroupsAccumulator}; -use fmt::Debug; -use std::any::Any; -use std::fmt; - -use arrow::datatypes::{DataType, Field, Schema}; - -use super::{Accumulator, AggregateExpr}; -use datafusion_common::{not_impl_err, Result}; -pub use datafusion_expr::AggregateUDF; -use datafusion_physical_expr::{LexOrdering, PhysicalExpr, PhysicalSortExpr}; - -use datafusion_physical_expr::aggregate::utils::{down_cast_any_ref, ordering_fields}; -use std::sync::Arc; - -/// Creates a physical expression of the UDAF, that includes all necessary type coercion. -/// This function errors when `args`' can't be coerced to a valid argument type of the UDAF. -pub fn create_aggregate_expr( - fun: &AggregateUDF, - input_phy_exprs: &[Arc], - sort_exprs: &[Expr], - ordering_req: &[PhysicalSortExpr], - schema: &Schema, - name: impl Into, - ignore_nulls: bool, -) -> Result> { - let input_exprs_types = input_phy_exprs - .iter() - .map(|arg| arg.data_type(schema)) - .collect::>>()?; - - let ordering_types = ordering_req - .iter() - .map(|e| e.expr.data_type(schema)) - .collect::>>()?; - - let ordering_fields = ordering_fields(ordering_req, &ordering_types); - - Ok(Arc::new(AggregateFunctionExpr { - fun: fun.clone(), - args: input_phy_exprs.to_vec(), - data_type: fun.return_type(&input_exprs_types)?, - name: name.into(), - schema: schema.clone(), - sort_exprs: sort_exprs.to_vec(), - ordering_req: ordering_req.to_vec(), - ignore_nulls, - ordering_fields, - })) -} - -/// Physical aggregate expression of a UDAF. 
-#[derive(Debug)] -pub struct AggregateFunctionExpr { - fun: AggregateUDF, - args: Vec>, - /// Output / return type of this aggregate - data_type: DataType, - name: String, - schema: Schema, - // The logical order by expressions - sort_exprs: Vec, - // The physical order by expressions - ordering_req: LexOrdering, - ignore_nulls: bool, - ordering_fields: Vec, -} - -impl AggregateFunctionExpr { - /// Return the `AggregateUDF` used by this `AggregateFunctionExpr` - pub fn fun(&self) -> &AggregateUDF { - &self.fun - } -} - -impl AggregateExpr for AggregateFunctionExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn expressions(&self) -> Vec> { - self.args.clone() - } - - fn state_fields(&self) -> Result> { - self.fun.state_fields( - self.name(), - self.data_type.clone(), - self.ordering_fields.clone(), - ) - } - - fn field(&self) -> Result { - Ok(Field::new(&self.name, self.data_type.clone(), true)) - } - - fn create_accumulator(&self) -> Result> { - let acc_args = AccumulatorArgs::new( - &self.data_type, - &self.schema, - self.ignore_nulls, - &self.sort_exprs, - ); - - self.fun.accumulator(acc_args) - } - - fn create_sliding_accumulator(&self) -> Result> { - let accumulator = self.create_accumulator()?; - - // Accumulators that have window frame startings different - // than `UNBOUNDED PRECEDING`, such as `1 PRECEEDING`, need to - // implement retract_batch method in order to run correctly - // currently in DataFusion. - // - // If this `retract_batches` is not present, there is no way - // to calculate result correctly. For example, the query - // - // ```sql - // SELECT - // SUM(a) OVER(ORDER BY a ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS sum_a - // FROM - // t - // ``` - // - // 1. First sum value will be the sum of rows between `[0, 1)`, - // - // 2. Second sum value will be the sum of rows between `[0, 2)` - // - // 3. Third sum value will be the sum of rows between `[1, 3)`, etc. - // - // Since the accumulator keeps the running sum: - // - // 1. First sum we add to the state sum value between `[0, 1)` - // - // 2. Second sum we add to the state sum value between `[1, 2)` - // (`[0, 1)` is already in the state sum, hence running sum will - // cover `[0, 2)` range) - // - // 3. Third sum we add to the state sum value between `[2, 3)` - // (`[0, 2)` is already in the state sum). Also we need to - // retract values between `[0, 1)` by this way we can obtain sum - // between [1, 3) which is indeed the apropriate range. - // - // When we use `UNBOUNDED PRECEDING` in the query starting - // index will always be 0 for the desired range, and hence the - // `retract_batch` method will not be called. In this case - // having retract_batch is not a requirement. - // - // This approach is a a bit different than window function - // approach. In window function (when they use a window frame) - // they get all the desired range during evaluation. 
- if !accumulator.supports_retract_batch() { - return not_impl_err!( - "Aggregate can not be used as a sliding accumulator because \ - `retract_batch` is not implemented: {}", - self.name - ); - } - Ok(accumulator) - } - - fn name(&self) -> &str { - &self.name - } - - fn groups_accumulator_supported(&self) -> bool { - self.fun.groups_accumulator_supported() - } - - fn create_groups_accumulator(&self) -> Result> { - self.fun.create_groups_accumulator() - } - - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - (!self.ordering_req.is_empty()).then_some(&self.ordering_req) - } -} - -impl PartialEq for AggregateFunctionExpr { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.name == x.name - && self.data_type == x.data_type - && self.fun == x.fun - && self.args.len() == x.args.len() - && self - .args - .iter() - .zip(x.args.iter()) - .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) - }) - .unwrap_or(false) - } -} From 51cd27258161f361a1ef60743950be0b94c5799c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 6 Apr 2024 20:06:16 +0800 Subject: [PATCH 34/38] move back to common Signed-off-by: jayzhan211 --- .../functions-aggregate/src/first_last.rs | 6 +- datafusion/functions-aggregate/src/lib.rs | 273 +--------------- datafusion/functions-array/Cargo.toml | 1 + datafusion/functions-array/src/macros.rs | 1 + .../physical-expr-common/src/aggregate/mod.rs | 293 ++++++++++++++++++ .../src/aggregate}/utils.rs | 5 +- datafusion/physical-expr-common/src/lib.rs | 1 + datafusion/physical-expr/src/aggregate/mod.rs | 2 +- .../physical-expr/src/aggregate/utils.rs | 6 +- datafusion/physical-expr/src/lib.rs | 2 +- datafusion/physical-plan/Cargo.toml | 1 + datafusion/physical-plan/src/lib.rs | 5 +- 12 files changed, 315 insertions(+), 281 deletions(-) create mode 100644 datafusion/physical-expr-common/src/aggregate/mod.rs rename datafusion/{functions-aggregate/src => physical-expr-common/src/aggregate}/utils.rs (96%) diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 988520adb9c3..d5367ad34163 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -17,8 +17,6 @@ //! Defines the FIRST_VALUE/LAST_VALUE aggregations. 
-use crate::utils::{down_cast_any_ref, get_sort_options, ordering_fields}; -use crate::AggregateExpr; use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::compute::{self, lexsort_to_indices, SortColumn, SortOptions}; use arrow::datatypes::{DataType, Field}; @@ -30,6 +28,10 @@ use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{Accumulator, AggregateUDFImpl, Expr, Signature, Volatility}; +use datafusion_physical_expr_common::aggregate::utils::{ + down_cast_any_ref, get_sort_options, ordering_fields, +}; +use datafusion_physical_expr_common::aggregate::AggregateExpr; use datafusion_physical_expr_common::expressions; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/functions-aggregate/src/lib.rs b/datafusion/functions-aggregate/src/lib.rs index cea556fcec7a..8016b76889f7 100644 --- a/datafusion/functions-aggregate/src/lib.rs +++ b/datafusion/functions-aggregate/src/lib.rs @@ -56,22 +56,12 @@ pub mod macros; pub mod first_last; -pub mod utils; -use arrow::datatypes::{DataType, Field, Schema}; -use datafusion_common::{not_impl_err, Result}; +use datafusion_common::Result; use datafusion_execution::FunctionRegistry; -use datafusion_expr::{ - function::AccumulatorArgs, Accumulator, AggregateUDF, Expr, GroupsAccumulator, -}; -use datafusion_physical_expr_common::{ - physical_expr::PhysicalExpr, - sort_expr::{LexOrdering, PhysicalSortExpr}, -}; +use datafusion_expr::AggregateUDF; use log::debug; -use std::fmt::Debug; -use std::{any::Any, sync::Arc}; -use utils::{down_cast_any_ref, ordering_fields}; +use std::sync::Arc; /// Fluent-style API for creating `Expr`s pub mod expr_fn { @@ -92,260 +82,3 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { Ok(()) } - -/// Creates a physical expression of the UDAF, that includes all necessary type coercion. -/// This function errors when `args`' can't be coerced to a valid argument type of the UDAF. -pub fn create_aggregate_expr( - fun: &AggregateUDF, - input_phy_exprs: &[Arc], - sort_exprs: &[Expr], - ordering_req: &[PhysicalSortExpr], - schema: &Schema, - name: impl Into, - ignore_nulls: bool, -) -> Result> { - let input_exprs_types = input_phy_exprs - .iter() - .map(|arg| arg.data_type(schema)) - .collect::>>()?; - - let ordering_types = ordering_req - .iter() - .map(|e| e.expr.data_type(schema)) - .collect::>>()?; - - let ordering_fields = ordering_fields(ordering_req, &ordering_types); - - Ok(Arc::new(AggregateFunctionExpr { - fun: fun.clone(), - args: input_phy_exprs.to_vec(), - data_type: fun.return_type(&input_exprs_types)?, - name: name.into(), - schema: schema.clone(), - sort_exprs: sort_exprs.to_vec(), - ordering_req: ordering_req.to_vec(), - ignore_nulls, - ordering_fields, - })) -} - -/// An aggregate expression that: -/// * knows its resulting field -/// * knows how to create its accumulator -/// * knows its accumulator's state's field -/// * knows the expressions from whose its accumulator will receive values -/// -/// Any implementation of this trait also needs to implement the -/// `PartialEq` to allows comparing equality between the -/// trait objects. -pub trait AggregateExpr: Send + Sync + Debug + PartialEq { - /// Returns the aggregate expression as [`Any`] so that it can be - /// downcast to a specific implementation. 
- fn as_any(&self) -> &dyn Any; - - /// the field of the final result of this aggregation. - fn field(&self) -> Result; - - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; - - /// the fields that encapsulate the Accumulator's state - /// the number of fields here equals the number of states that the accumulator contains - fn state_fields(&self) -> Result>; - - /// expressions that are passed to the Accumulator. - /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. - fn expressions(&self) -> Vec>; - - /// Order by requirements for the aggregate function - /// By default it is `None` (there is no requirement) - /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)` should implement this - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - None - } - - /// Human readable name such as `"MIN(c2)"`. The default - /// implementation returns placeholder text. - fn name(&self) -> &str { - "AggregateExpr: default name" - } - - /// If the aggregate expression has a specialized - /// [`GroupsAccumulator`] implementation. If this returns true, - /// `[Self::create_groups_accumulator`] will be called. - fn groups_accumulator_supported(&self) -> bool { - false - } - - /// Return a specialized [`GroupsAccumulator`] that manages state - /// for all groups. - /// - /// For maximum performance, a [`GroupsAccumulator`] should be - /// implemented in addition to [`Accumulator`]. - fn create_groups_accumulator(&self) -> Result> { - not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet") - } - - /// Construct an expression that calculates the aggregate in reverse. - /// Typically the "reverse" expression is itself (e.g. SUM, COUNT). - /// For aggregates that do not support calculation in reverse, - /// returns None (which is the default value). - fn reverse_expr(&self) -> Option> { - None - } - - /// Creates accumulator implementation that supports retract - fn create_sliding_accumulator(&self) -> Result> { - not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet") - } -} - -/// Physical aggregate expression of a UDAF. 
-#[derive(Debug)] -pub struct AggregateFunctionExpr { - fun: AggregateUDF, - args: Vec>, - /// Output / return type of this aggregate - data_type: DataType, - name: String, - schema: Schema, - // The logical order by expressions - sort_exprs: Vec, - // The physical order by expressions - ordering_req: LexOrdering, - ignore_nulls: bool, - ordering_fields: Vec, -} - -impl AggregateFunctionExpr { - /// Return the `AggregateUDF` used by this `AggregateFunctionExpr` - pub fn fun(&self) -> &AggregateUDF { - &self.fun - } -} - -impl AggregateExpr for AggregateFunctionExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn expressions(&self) -> Vec> { - self.args.clone() - } - - fn state_fields(&self) -> Result> { - self.fun.state_fields( - self.name(), - self.data_type.clone(), - self.ordering_fields.clone(), - ) - } - - fn field(&self) -> Result { - Ok(Field::new(&self.name, self.data_type.clone(), true)) - } - - fn create_accumulator(&self) -> Result> { - let acc_args = AccumulatorArgs::new( - &self.data_type, - &self.schema, - self.ignore_nulls, - &self.sort_exprs, - ); - - self.fun.accumulator(acc_args) - } - - fn create_sliding_accumulator(&self) -> Result> { - let accumulator = self.create_accumulator()?; - - // Accumulators that have window frame startings different - // than `UNBOUNDED PRECEDING`, such as `1 PRECEEDING`, need to - // implement retract_batch method in order to run correctly - // currently in DataFusion. - // - // If this `retract_batches` is not present, there is no way - // to calculate result correctly. For example, the query - // - // ```sql - // SELECT - // SUM(a) OVER(ORDER BY a ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS sum_a - // FROM - // t - // ``` - // - // 1. First sum value will be the sum of rows between `[0, 1)`, - // - // 2. Second sum value will be the sum of rows between `[0, 2)` - // - // 3. Third sum value will be the sum of rows between `[1, 3)`, etc. - // - // Since the accumulator keeps the running sum: - // - // 1. First sum we add to the state sum value between `[0, 1)` - // - // 2. Second sum we add to the state sum value between `[1, 2)` - // (`[0, 1)` is already in the state sum, hence running sum will - // cover `[0, 2)` range) - // - // 3. Third sum we add to the state sum value between `[2, 3)` - // (`[0, 2)` is already in the state sum). Also we need to - // retract values between `[0, 1)` by this way we can obtain sum - // between [1, 3) which is indeed the apropriate range. - // - // When we use `UNBOUNDED PRECEDING` in the query starting - // index will always be 0 for the desired range, and hence the - // `retract_batch` method will not be called. In this case - // having retract_batch is not a requirement. - // - // This approach is a a bit different than window function - // approach. In window function (when they use a window frame) - // they get all the desired range during evaluation. 
- if !accumulator.supports_retract_batch() { - return not_impl_err!( - "Aggregate can not be used as a sliding accumulator because \ - `retract_batch` is not implemented: {}", - self.name - ); - } - Ok(accumulator) - } - - fn name(&self) -> &str { - &self.name - } - - fn groups_accumulator_supported(&self) -> bool { - self.fun.groups_accumulator_supported() - } - - fn create_groups_accumulator(&self) -> Result> { - self.fun.create_groups_accumulator() - } - - fn order_bys(&self) -> Option<&[PhysicalSortExpr]> { - (!self.ordering_req.is_empty()).then_some(&self.ordering_req) - } -} - -impl PartialEq for AggregateFunctionExpr { - fn eq(&self, other: &dyn Any) -> bool { - down_cast_any_ref(other) - .downcast_ref::() - .map(|x| { - self.name == x.name - && self.data_type == x.data_type - && self.fun == x.fun - && self.args.len() == x.args.len() - && self - .args - .iter() - .zip(x.args.iter()) - .all(|(this_arg, other_arg)| this_arg.eq(other_arg)) - }) - .unwrap_or(false) - } -} diff --git a/datafusion/functions-array/Cargo.toml b/datafusion/functions-array/Cargo.toml index 6ef9c6b055af..8257f9db8c46 100644 --- a/datafusion/functions-array/Cargo.toml +++ b/datafusion/functions-array/Cargo.toml @@ -49,6 +49,7 @@ datafusion-functions = { workspace = true } itertools = { version = "0.12", features = ["use_std"] } log = { workspace = true } paste = "1.0.14" +once_cell = "1.19.0" [dev-dependencies] criterion = { version = "0.5", features = ["async_tokio"] } diff --git a/datafusion/functions-array/src/macros.rs b/datafusion/functions-array/src/macros.rs index c49f5830b8d5..e637ec7a898e 100644 --- a/datafusion/functions-array/src/macros.rs +++ b/datafusion/functions-array/src/macros.rs @@ -68,6 +68,7 @@ macro_rules! make_udf_function { pub fn $SCALAR_UDF_FN() -> std::sync::Arc { [< STATIC_ $UDF >] .get_or_init(|| { + println!("Creating UDF"); std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl( <$UDF>::new(), )) diff --git a/datafusion/physical-expr-common/src/aggregate/mod.rs b/datafusion/physical-expr-common/src/aggregate/mod.rs new file mode 100644 index 000000000000..244e52b8b0b7 --- /dev/null +++ b/datafusion/physical-expr-common/src/aggregate/mod.rs @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+pub mod utils;
+
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_common::{not_impl_err, Result};
+// use datafusion_execution::FunctionRegistry;
+use datafusion_expr::{
+    function::AccumulatorArgs, Accumulator, AggregateUDF, Expr, GroupsAccumulator,
+};
+// use datafusion_physical_expr_common::{
+//     physical_expr::PhysicalExpr,
+//     sort_expr::{LexOrdering, PhysicalSortExpr},
+// };
+use std::fmt::Debug;
+use std::{any::Any, sync::Arc};
+
+use crate::physical_expr::PhysicalExpr;
+use crate::sort_expr::{LexOrdering, PhysicalSortExpr};
+
+use self::utils::{down_cast_any_ref, ordering_fields};
+
+/// Creates a physical expression of the UDAF that includes all necessary type coercion.
+/// This function errors when `args` can't be coerced to a valid argument type of the UDAF.
+pub fn create_aggregate_expr(
+    fun: &AggregateUDF,
+    input_phy_exprs: &[Arc<dyn PhysicalExpr>],
+    sort_exprs: &[Expr],
+    ordering_req: &[PhysicalSortExpr],
+    schema: &Schema,
+    name: impl Into<String>,
+    ignore_nulls: bool,
+) -> Result<Arc<dyn AggregateExpr>> {
+    let input_exprs_types = input_phy_exprs
+        .iter()
+        .map(|arg| arg.data_type(schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    let ordering_types = ordering_req
+        .iter()
+        .map(|e| e.expr.data_type(schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    let ordering_fields = ordering_fields(ordering_req, &ordering_types);
+
+    Ok(Arc::new(AggregateFunctionExpr {
+        fun: fun.clone(),
+        args: input_phy_exprs.to_vec(),
+        data_type: fun.return_type(&input_exprs_types)?,
+        name: name.into(),
+        schema: schema.clone(),
+        sort_exprs: sort_exprs.to_vec(),
+        ordering_req: ordering_req.to_vec(),
+        ignore_nulls,
+        ordering_fields,
+    }))
+}
+
+/// An aggregate expression that:
+/// * knows its resulting field
+/// * knows how to create its accumulator
+/// * knows its accumulator's state fields
+/// * knows the expressions from which its accumulator will receive values
+///
+/// Any implementation of this trait also needs to implement
+/// `PartialEq<dyn Any>` to allow comparing equality between the
+/// trait objects.
+pub trait AggregateExpr: Send + Sync + Debug + PartialEq<dyn Any> {
+    /// Returns the aggregate expression as [`Any`] so that it can be
+    /// downcast to a specific implementation.
+    fn as_any(&self) -> &dyn Any;
+
+    /// The field of the final result of this aggregation.
+    fn field(&self) -> Result<Field>;
+
+    /// The accumulator used to accumulate values from the expressions.
+    /// The accumulator expects the same number of arguments as `expressions` and must
+    /// return states with the same description as `state_fields`.
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>>;
+
+    /// The fields that encapsulate the Accumulator's state;
+    /// the number of fields here equals the number of states that the accumulator contains.
+    fn state_fields(&self) -> Result<Vec<Field>>;
+
+    /// Expressions that are passed to the Accumulator.
+    /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many.
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>>;
+
+    /// Order by requirements for the aggregate function.
+    /// By default it is `None` (there is no requirement).
+    /// Order-sensitive aggregators, such as `FIRST_VALUE(x ORDER BY y)`, should implement this.
+    fn order_bys(&self) -> Option<&[PhysicalSortExpr]> {
+        None
+    }
+
+    /// Human-readable name such as `"MIN(c2)"`. The default
+    /// implementation returns placeholder text.
+    fn name(&self) -> &str {
+        "AggregateExpr: default name"
+    }
+
+    /// Whether the aggregate expression has a specialized
+    /// [`GroupsAccumulator`] implementation. If this returns true,
+    /// [`Self::create_groups_accumulator`] will be called.
+    fn groups_accumulator_supported(&self) -> bool {
+        false
+    }
+
+    /// Return a specialized [`GroupsAccumulator`] that manages state
+    /// for all groups.
+    ///
+    /// For maximum performance, a [`GroupsAccumulator`] should be
+    /// implemented in addition to [`Accumulator`].
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        not_impl_err!("GroupsAccumulator hasn't been implemented for {self:?} yet")
+    }
+
+    /// Construct an expression that calculates the aggregate in reverse.
+    /// Typically the "reverse" expression is itself (e.g. SUM, COUNT).
+    /// For aggregates that do not support calculation in reverse,
+    /// returns `None` (which is the default value).
+    fn reverse_expr(&self) -> Option<Arc<dyn AggregateExpr>> {
+        None
+    }
+
+    /// Creates an accumulator implementation that supports retraction.
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        not_impl_err!("Retractable Accumulator hasn't been implemented for {self:?} yet")
+    }
+}
+
+/// Physical aggregate expression of a UDAF.
+#[derive(Debug)]
+pub struct AggregateFunctionExpr {
+    fun: AggregateUDF,
+    args: Vec<Arc<dyn PhysicalExpr>>,
+    /// Output / return type of this aggregate
+    data_type: DataType,
+    name: String,
+    schema: Schema,
+    // The logical order by expressions
+    sort_exprs: Vec<Expr>,
+    // The physical order by expressions
+    ordering_req: LexOrdering,
+    ignore_nulls: bool,
+    ordering_fields: Vec<Field>,
+}
+
+impl AggregateFunctionExpr {
+    /// Return the `AggregateUDF` used by this `AggregateFunctionExpr`
+    pub fn fun(&self) -> &AggregateUDF {
+        &self.fun
+    }
+}
+
+impl AggregateExpr for AggregateFunctionExpr {
+    /// Return a reference to Any that can be used for downcasting
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
+        self.args.clone()
+    }
+
+    fn state_fields(&self) -> Result<Vec<Field>> {
+        self.fun.state_fields(
+            self.name(),
+            self.data_type.clone(),
+            self.ordering_fields.clone(),
+        )
+    }
+
+    fn field(&self) -> Result<Field> {
+        Ok(Field::new(&self.name, self.data_type.clone(), true))
+    }
+
+    fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        let acc_args = AccumulatorArgs::new(
+            &self.data_type,
+            &self.schema,
+            self.ignore_nulls,
+            &self.sort_exprs,
+        );
+
+        self.fun.accumulator(acc_args)
+    }
+
+    fn create_sliding_accumulator(&self) -> Result<Box<dyn Accumulator>> {
+        let accumulator = self.create_accumulator()?;
+
+        // Accumulators whose window frames start somewhere other than
+        // `UNBOUNDED PRECEDING`, such as `1 PRECEDING`, currently need to
+        // implement the `retract_batch` method in order to run correctly
+        // in DataFusion.
+        //
+        // If `retract_batch` is not implemented, there is no way
+        // to calculate the result correctly. For example, in the query
+        //
+        // ```sql
+        // SELECT
+        //   SUM(a) OVER(ORDER BY a ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS sum_a
+        // FROM
+        //   t
+        // ```
+        //
+        // 1. The first sum value will be the sum of rows between `[0, 1)`,
+        //
+        // 2. The second sum value will be the sum of rows between `[0, 2)`,
+        //
+        // 3. The third sum value will be the sum of rows between `[1, 3)`, etc.
+        //
+        // Since the accumulator keeps the running sum:
+        //
+        // 1. For the first sum we add to the state the values between `[0, 1)`,
+        //
+        // 2. For the second sum we add to the state the values between `[1, 2)`
+        // (`[0, 1)` is already in the state sum, hence the running sum will
+        // cover the `[0, 2)` range),
+        //
+        // 3. For the third sum we add to the state the values between `[2, 3)`
+        // (`[0, 2)` is already in the state sum). We also need to retract
+        // the values between `[0, 1)`; this way we obtain the sum over
+        // `[1, 3)`, which is indeed the appropriate range.
+        //
+        // When we use `UNBOUNDED PRECEDING` in the query, the starting
+        // index will always be 0 for the desired range, and hence the
+        // `retract_batch` method will not be called. In this case
+        // having `retract_batch` is not a requirement.
+        //
+        // This approach is a bit different from the window function
+        // approach, where (when a window frame is used) the entire
+        // desired range is available during evaluation.
+        if !accumulator.supports_retract_batch() {
+            return not_impl_err!(
+                "Aggregate cannot be used as a sliding accumulator because \
+                 `retract_batch` is not implemented: {}",
+                self.name
+            );
+        }
+        Ok(accumulator)
+    }
+
+    fn name(&self) -> &str {
+        &self.name
+    }
+
+    fn groups_accumulator_supported(&self) -> bool {
+        self.fun.groups_accumulator_supported()
+    }
+
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        self.fun.create_groups_accumulator()
+    }
+
+    fn order_bys(&self) -> Option<&[PhysicalSortExpr]> {
+        (!self.ordering_req.is_empty()).then_some(&self.ordering_req)
+    }
+}
+
+impl PartialEq<dyn Any> for AggregateFunctionExpr {
+    fn eq(&self, other: &dyn Any) -> bool {
+        down_cast_any_ref(other)
+            .downcast_ref::<Self>()
+            .map(|x| {
+                self.name == x.name
+                    && self.data_type == x.data_type
+                    && self.fun == x.fun
+                    && self.args.len() == x.args.len()
+                    && self
+                        .args
+                        .iter()
+                        .zip(x.args.iter())
+                        .all(|(this_arg, other_arg)| this_arg.eq(other_arg))
+            })
+            .unwrap_or(false)
+    }
+}
diff --git a/datafusion/functions-aggregate/src/utils.rs b/datafusion/physical-expr-common/src/aggregate/utils.rs
similarity index 96%
rename from datafusion/functions-aggregate/src/utils.rs
rename to datafusion/physical-expr-common/src/aggregate/utils.rs
index 989db83cf4f6..9821ba626b18 100644
--- a/datafusion/functions-aggregate/src/utils.rs
+++ b/datafusion/physical-expr-common/src/aggregate/utils.rs
@@ -21,9 +21,10 @@ use arrow::{
     compute::SortOptions,
     datatypes::{DataType, Field},
 };
-use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr;
-use crate::AggregateExpr;
+use crate::sort_expr::PhysicalSortExpr;
+
+use super::AggregateExpr;
 
 /// Downcast a `Box<dyn AggregateExpr>` or `Arc<dyn AggregateExpr>`
 /// and return the inner trait object as [`Any`] so
diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs
index 3fe11a246b32..53e3134a1b05 100644
--- a/datafusion/physical-expr-common/src/lib.rs
+++ b/datafusion/physical-expr-common/src/lib.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+pub mod aggregate;
 pub mod expressions;
 pub mod physical_expr;
 pub mod sort_expr;
diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs
index 40eeadfa4199..eff008e8f825 100644
--- a/datafusion/physical-expr/src/aggregate/mod.rs
+++ b/datafusion/physical-expr/src/aggregate/mod.rs
@@ -19,7 +19,7 @@ use std::sync::Arc;
 
 use crate::expressions::{NthValueAgg, OrderSensitiveArrayAgg};
 
-pub use datafusion_functions_aggregate::AggregateExpr;
+pub use datafusion_physical_expr_common::aggregate::AggregateExpr;
 
 mod hyperloglog;
 mod tdigest;
diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs
index 51b1363183f3..6d97ad3da6de 100644
--- a/datafusion/physical-expr/src/aggregate/utils.rs
+++ b/datafusion/physical-expr/src/aggregate/utils.rs
@@ -20,9 +20,9 @@ use std::sync::Arc;
 
 // For backwards compatibility
-pub use datafusion_functions_aggregate::utils::down_cast_any_ref;
-pub use datafusion_functions_aggregate::utils::get_sort_options;
-pub use datafusion_functions_aggregate::utils::ordering_fields;
+pub use datafusion_physical_expr_common::aggregate::utils::{
+    down_cast_any_ref, get_sort_options, ordering_fields,
+};
 
 use arrow::array::{ArrayRef, ArrowNativeTypeOp};
 use arrow_array::cast::AsArray;
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
index 65882c23b694..7b81e8f8a5c4 100644
--- a/datafusion/physical-expr/src/lib.rs
+++ b/datafusion/physical-expr/src/lib.rs
@@ -41,7 +41,7 @@ pub mod execution_props {
 
 pub use aggregate::groups_accumulator::{GroupsAccumulatorAdapter, NullState};
 pub use analysis::{analyze, AnalysisContext, ExprBoundaries};
-pub use datafusion_functions_aggregate::AggregateExpr;
+pub use datafusion_physical_expr_common::aggregate::AggregateExpr;
 pub use equivalence::EquivalenceProperties;
 pub use partitioning::{Distribution, Partitioning};
 pub use physical_expr::{
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index b76c557066d8..cedf55fd35c9 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -47,6 +47,7 @@ datafusion-common-runtime = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-functions-aggregate = { workspace = true }
+datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
 futures = { workspace = true }
 half = { workspace = true }
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 59166365b91d..e1c8489655bf 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -91,8 +91,9 @@
 pub use crate::stream::EmptyRecordBatchStream;
 pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
 pub mod udaf {
-    pub use datafusion_functions_aggregate::create_aggregate_expr;
-    pub use datafusion_functions_aggregate::AggregateFunctionExpr;
+    pub use datafusion_physical_expr_common::aggregate::{
+        create_aggregate_expr, AggregateFunctionExpr,
+    };
 }
 
 /// Represent nodes in the DataFusion Physical Plan.
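Note: after [PATCH 34/38], `create_aggregate_expr` and `AggregateExpr` live in `datafusion_physical_expr_common::aggregate` and are re-exported through the `udaf` module of `datafusion-physical-plan`. Below is a minimal sketch of a call site against the relocated API; `create_aggregate_expr` and its signature come from the patch above, while the wrapper function, its name, and the display name `"example_udaf(c1)"` are illustrative assumptions, not part of the patch:

```rust
// Sketch: building the physical UDAF expression via the relocated module.
use std::sync::Arc;

use arrow::datatypes::Schema;
use datafusion_common::Result;
use datafusion_expr::AggregateUDF;
use datafusion_physical_expr_common::aggregate::{create_aggregate_expr, AggregateExpr};
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;

fn example_plan_udaf(
    udaf: &AggregateUDF,
    arg: Arc<dyn PhysicalExpr>,
    schema: &Schema,
) -> Result<Arc<dyn AggregateExpr>> {
    // No ORDER BY: both the logical sort expressions and the physical
    // ordering requirement are empty, and nulls are not ignored.
    create_aggregate_expr(
        udaf,
        &[arg],
        &[], // sort_exprs (logical)
        &[], // ordering_req (physical)
        schema,
        "example_udaf(c1)",
        false, // ignore_nulls
    )
}
```

Passing empty slices for `sort_exprs` and `ordering_req` mirrors the common case of an order-insensitive aggregate; an order-sensitive UDAF such as `FIRST_VALUE` would populate both, as `first_last.rs` does.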
From 38b2ce7234a85a1e6adee684903c358db8925c21 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sat, 6 Apr 2024 20:08:10 +0800
Subject: [PATCH 35/38] taplo

Signed-off-by: jayzhan211
---
 datafusion-cli/Cargo.lock             | 2 ++
 datafusion/functions-array/Cargo.toml | 2 +-
 datafusion/physical-plan/Cargo.toml   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index b2a7c425f579..3acb39ca9a68 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1308,6 +1308,7 @@ dependencies = [
  "datafusion-functions",
  "itertools",
  "log",
+ "once_cell",
  "paste",
 ]
 
@@ -1387,6 +1388,7 @@ dependencies = [
  "datafusion-expr",
  "datafusion-functions-aggregate",
  "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
  "futures",
  "half",
  "hashbrown 0.14.3",
diff --git a/datafusion/functions-array/Cargo.toml b/datafusion/functions-array/Cargo.toml
index 8257f9db8c46..34d1ed9a8b04 100644
--- a/datafusion/functions-array/Cargo.toml
+++ b/datafusion/functions-array/Cargo.toml
@@ -48,8 +48,8 @@ datafusion-expr = { workspace = true }
 datafusion-functions = { workspace = true }
 itertools = { version = "0.12", features = ["use_std"] }
 log = { workspace = true }
-paste = "1.0.14"
 once_cell = "1.19.0"
+paste = "1.0.14"
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["async_tokio"] }
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index cedf55fd35c9..6a78bd596a46 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -47,8 +47,8 @@ datafusion-common-runtime = { workspace = true, default-features = true }
 datafusion-execution = { workspace = true }
 datafusion-expr = { workspace = true }
 datafusion-functions-aggregate = { workspace = true }
-datafusion-physical-expr-common = { workspace = true }
 datafusion-physical-expr = { workspace = true, default-features = true }
+datafusion-physical-expr-common = { workspace = true }
 futures = { workspace = true }
 half = { workspace = true }
 hashbrown = { version = "0.14", features = ["raw"] }

From 9c7767cdefba51c0304d90a722b60b9ae9bcf880 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sun, 7 Apr 2024 08:37:46 +0800
Subject: [PATCH 36/38] rm comment

Signed-off-by: jayzhan211
---
 datafusion/functions-array/src/macros.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datafusion/functions-array/src/macros.rs b/datafusion/functions-array/src/macros.rs
index e637ec7a898e..c49f5830b8d5 100644
--- a/datafusion/functions-array/src/macros.rs
+++ b/datafusion/functions-array/src/macros.rs
@@ -68,7 +68,6 @@ macro_rules! make_udf_function {
         pub fn $SCALAR_UDF_FN() -> std::sync::Arc<datafusion_expr::ScalarUDF> {
             [< STATIC_ $UDF >]
                 .get_or_init(|| {
-                    println!("Creating UDF");
                     std::sync::Arc::new(datafusion_expr::ScalarUDF::new_from_impl(
                         <$UDF>::new(),
                     ))

From ea4addebb1c837dc5be38f353c810e296066144d Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sun, 7 Apr 2024 09:14:34 +0800
Subject: [PATCH 37/38] cleanup

Signed-off-by: jayzhan211
---
 datafusion/functions-array/Cargo.toml                | 1 -
 datafusion/physical-expr-common/src/aggregate/mod.rs | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/datafusion/functions-array/Cargo.toml b/datafusion/functions-array/Cargo.toml
index 34d1ed9a8b04..6ef9c6b055af 100644
--- a/datafusion/functions-array/Cargo.toml
+++ b/datafusion/functions-array/Cargo.toml
@@ -48,7 +48,6 @@ datafusion-expr = { workspace = true }
 datafusion-functions = { workspace = true }
 itertools = { version = "0.12", features = ["use_std"] }
 log = { workspace = true }
-once_cell = "1.19.0"
 paste = "1.0.14"
 
 [dev-dependencies]
diff --git a/datafusion/physical-expr-common/src/aggregate/mod.rs b/datafusion/physical-expr-common/src/aggregate/mod.rs
index 244e52b8b0b7..33044fd9beee 100644
--- a/datafusion/physical-expr-common/src/aggregate/mod.rs
+++ b/datafusion/physical-expr-common/src/aggregate/mod.rs
@@ -19,14 +19,9 @@ pub mod utils;
 
 use arrow::datatypes::{DataType, Field, Schema};
 use datafusion_common::{not_impl_err, Result};
-// use datafusion_execution::FunctionRegistry;
 use datafusion_expr::{
     function::AccumulatorArgs, Accumulator, AggregateUDF, Expr, GroupsAccumulator,
 };
-// use datafusion_physical_expr_common::{
-//     physical_expr::PhysicalExpr,
-//     sort_expr::{LexOrdering, PhysicalSortExpr},
-// };
 use std::fmt::Debug;
 use std::{any::Any, sync::Arc};

From 39c5d15d14dcd48e84d35b1359510ddc5ce0d588 Mon Sep 17 00:00:00 2001
From: jayzhan211
Date: Sun, 7 Apr 2024 09:15:05 +0800
Subject: [PATCH 38/38] lock

Signed-off-by: jayzhan211
---
 datafusion-cli/Cargo.lock | 1 -
 1 file changed, 1 deletion(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 3acb39ca9a68..a0a7b20ac40e 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1308,7 +1308,6 @@ dependencies = [
  "datafusion-functions",
  "itertools",
  "log",
- "once_cell",
  "paste",
 ]
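Note: the retraction logic documented in `create_sliding_accumulator` in [PATCH 34/38] can be modeled outside DataFusion. The sketch below is a self-contained illustration of the same arithmetic for `SUM(a) OVER (ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)`; it uses plain functions rather than the `Accumulator` trait, and all names in it are illustrative:

```rust
// Model of a sliding-sum accumulator: `state` holds the running sum of the
// rows in [retracted, covered). Each step adds rows entering the frame
// (the role of `update_batch`) and removes rows leaving it (the role of
// `retract_batch`).
fn sliding_sums(values: &[i64]) -> Vec<i64> {
    let len = values.len();
    let mut state = 0i64; // running sum kept by the "accumulator"
    let mut covered = 0; // rows [retracted, covered) are in `state`
    let mut retracted = 0;
    let mut out = Vec::with_capacity(len);
    for i in 0..len {
        let frame_start = i.saturating_sub(1); // 1 PRECEDING
        let frame_end = (i + 2).min(len); // 1 FOLLOWING (exclusive end)
        // "update_batch": add rows that just entered the frame.
        while covered < frame_end {
            state += values[covered];
            covered += 1;
        }
        // "retract_batch": remove rows that just left the frame.
        while retracted < frame_start {
            state -= values[retracted];
            retracted += 1;
        }
        out.push(state);
    }
    out
}

fn main() {
    // Frames over [1, 2, 3, 4] are [0,2), [0,3), [1,4), [2,4),
    // so the expected sums are 3, 6, 9, 7.
    assert_eq!(sliding_sums(&[1, 2, 3, 4]), vec![3, 6, 9, 7]);
    println!("sliding sums match the frame-by-frame definition");
}
```

With `UNBOUNDED PRECEDING`, `frame_start` would always be 0 and the retraction loop would never run, which is why `retract_batch` is only a requirement for frames that start elsewhere.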