diff --git a/README.md b/README.md index 5d0b096c1de1..30505d7ca132 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,9 @@ DataFusion is an extensible query engine written in [Rust] that uses [Apache Arrow] as its in-memory format. -The DataFusion libraries in this repository are used to build data-centric system software. DataFusion also provides the -following subprojects, which are packaged versions of DataFusion intended for end users. +This crate provides libraries and binaries for developers building fast and +feature rich database and analytic systems, customized to particular workloads. +See [use cases] for examples. The following related subprojects target end users: - [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame queries. @@ -54,13 +55,10 @@ following subprojects, which are packaged versions of DataFusion intended for en - [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on DataFusion. -The target audience for the DataFusion crates in this repository are -developers building fast and feature rich database and analytic systems, -customized to particular workloads. See [use cases] for examples. - -DataFusion offers [SQL] and [`Dataframe`] APIs, -excellent [performance], built-in support for CSV, Parquet, JSON, and Avro, -extensive customization, and a great community. +"Out of the box," +DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance], +built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and +a great community. DataFusion features a full query planner, a columnar, streaming, multi-threaded, vectorized execution engine, and partitioned data sources. You can diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index 36254192550c..422fcb5eb3e0 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -279,8 +279,88 @@ pub fn get_data_dir( } } +#[macro_export] +macro_rules! create_array { + (Boolean, $values: expr) => { + std::sync::Arc::new(arrow::array::BooleanArray::from($values)) + }; + (Int8, $values: expr) => { + std::sync::Arc::new(arrow::array::Int8Array::from($values)) + }; + (Int16, $values: expr) => { + std::sync::Arc::new(arrow::array::Int16Array::from($values)) + }; + (Int32, $values: expr) => { + std::sync::Arc::new(arrow::array::Int32Array::from($values)) + }; + (Int64, $values: expr) => { + std::sync::Arc::new(arrow::array::Int64Array::from($values)) + }; + (UInt8, $values: expr) => { + std::sync::Arc::new(arrow::array::UInt8Array::from($values)) + }; + (UInt16, $values: expr) => { + std::sync::Arc::new(arrow::array::UInt16Array::from($values)) + }; + (UInt32, $values: expr) => { + std::sync::Arc::new(arrow::array::UInt32Array::from($values)) + }; + (UInt64, $values: expr) => { + std::sync::Arc::new(arrow::array::UInt64Array::from($values)) + }; + (Float16, $values: expr) => { + std::sync::Arc::new(arrow::array::Float16Array::from($values)) + }; + (Float32, $values: expr) => { + std::sync::Arc::new(arrow::array::Float32Array::from($values)) + }; + (Float64, $values: expr) => { + std::sync::Arc::new(arrow::array::Float64Array::from($values)) + }; + (Utf8, $values: expr) => { + std::sync::Arc::new(arrow::array::StringArray::from($values)) + }; +} + +/// Creates a record batch from literal slice of values, suitable for rapid +/// testing and development. +/// +/// Example: +/// ``` +/// use datafusion_common::{record_batch, create_array}; +/// let batch = record_batch!( +/// ("a", Int32, vec![1, 2, 3]), +/// ("b", Float64, vec![Some(4.0), None, Some(5.0)]), +/// ("c", Utf8, vec!["alpha", "beta", "gamma"]) +/// ); +/// ``` +#[macro_export] +macro_rules! record_batch { + ($(($name: expr, $type: ident, $values: expr)),*) => { + { + let schema = std::sync::Arc::new(arrow_schema::Schema::new(vec![ + $( + arrow_schema::Field::new($name, arrow_schema::DataType::$type, true), + )* + ])); + + let batch = arrow_array::RecordBatch::try_new( + schema, + vec![$( + create_array!($type, $values), + )*] + ); + + batch + } + } +} + #[cfg(test)] mod tests { + use crate::cast::{as_float64_array, as_int32_array, as_string_array}; + use crate::error::Result; + use super::*; use std::env; @@ -333,4 +413,44 @@ mod tests { let res = parquet_test_data(); assert!(PathBuf::from(res).is_dir()); } + + #[test] + fn test_create_record_batch() -> Result<()> { + use arrow_array::Array; + + let batch = record_batch!( + ("a", Int32, vec![1, 2, 3, 4]), + ("b", Float64, vec![Some(4.0), None, Some(5.0), None]), + ("c", Utf8, vec!["alpha", "beta", "gamma", "delta"]) + )?; + + assert_eq!(3, batch.num_columns()); + assert_eq!(4, batch.num_rows()); + + let values: Vec<_> = as_int32_array(batch.column(0))? + .values() + .iter() + .map(|v| v.to_owned()) + .collect(); + assert_eq!(values, vec![1, 2, 3, 4]); + + let values: Vec<_> = as_float64_array(batch.column(1))? + .values() + .iter() + .map(|v| v.to_owned()) + .collect(); + assert_eq!(values, vec![4.0, 0.0, 5.0, 0.0]); + + let nulls: Vec<_> = as_float64_array(batch.column(1))? + .nulls() + .unwrap() + .iter() + .collect(); + assert_eq!(nulls, vec![true, false, true, false]); + + let values: Vec<_> = as_string_array(batch.column(2))?.iter().flatten().collect(); + assert_eq!(values, vec!["alpha", "beta", "gamma", "delta"]); + + Ok(()) + } } diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs index 53cfe94ecab3..d9415028c124 100644 --- a/datafusion/core/src/bin/print_functions_docs.rs +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -108,7 +108,7 @@ fn print_docs( .collect::>(); // write out section header - let _ = writeln!(docs, "## {} ", doc_section.label); + let _ = writeln!(docs, "\n## {} \n", doc_section.label); if let Some(description) = doc_section.description { let _ = writeln!(docs, "{description}"); diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 24a4938e7b2b..3c2d1b0205d6 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -37,14 +37,14 @@ use crate::physical_planner::create_physical_sort_exprs; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; +use datafusion_catalog::Session; use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; use datafusion_execution::TaskContext; use datafusion_expr::dml::InsertOp; +use datafusion_expr::SortExpr; use datafusion_physical_plan::metrics::MetricsSet; use async_trait::async_trait; -use datafusion_catalog::Session; -use datafusion_expr::SortExpr; use futures::StreamExt; use log::debug; use parking_lot::Mutex; @@ -241,7 +241,7 @@ impl TableProvider for MemTable { ) }) .collect::>>()?; - exec = exec.with_sort_information(file_sort_order); + exec = exec.try_with_sort_information(file_sort_order)?; } Ok(Arc::new(exec)) diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index b0852501415e..64a7514ebd5e 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -395,7 +395,8 @@ async fn run_aggregate_test(input1: Vec, group_by_columns: Vec<&str let running_source = Arc::new( MemoryExec::try_new(&[input1.clone()], schema.clone(), None) .unwrap() - .with_sort_information(vec![sort_keys]), + .try_with_sort_information(vec![sort_keys]) + .unwrap(), ); let aggregate_expr = diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 0cd702372f7c..a72affc2b079 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -358,7 +358,8 @@ mod sp_repartition_fuzz_tests { let running_source = Arc::new( MemoryExec::try_new(&[input1.clone()], schema.clone(), None) .unwrap() - .with_sort_information(vec![sort_keys.clone()]), + .try_with_sort_information(vec![sort_keys.clone()]) + .unwrap(), ); let hash_exprs = vec![col("c", &schema).unwrap()]; diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index b9881c9f23cf..feffb11bf700 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -647,7 +647,7 @@ async fn run_window_test( ]; let mut exec1 = Arc::new( MemoryExec::try_new(&[vec![concat_input_record]], schema.clone(), None)? - .with_sort_information(vec![source_sort_keys.clone()]), + .try_with_sort_information(vec![source_sort_keys.clone()])?, ) as _; // Table is ordered according to ORDER BY a, b, c In linear test we use PARTITION BY b, ORDER BY a // For WindowAggExec to produce correct result it need table to be ordered by b,a. Hence add a sort. @@ -673,7 +673,7 @@ async fn run_window_test( )?) as _; let exec2 = Arc::new( MemoryExec::try_new(&[input1.clone()], schema.clone(), None)? - .with_sort_information(vec![source_sort_keys.clone()]), + .try_with_sort_information(vec![source_sort_keys.clone()])?, ); let running_window_exec = Arc::new(BoundedWindowAggExec::try_new( vec![create_window_expr( diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index ec66df45c7ba..fc2fb9afb5f9 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -840,7 +840,7 @@ impl TableProvider for SortedTableProvider { ) -> Result> { let mem_exec = MemoryExec::try_new(&self.batches, self.schema(), projection.cloned())? - .with_sort_information(self.sort_information.clone()); + .try_with_sort_information(self.sort_information.clone())?; Ok(Arc::new(mem_exec)) } diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index e8245588d945..e0ce7526036e 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -155,7 +155,7 @@ impl DocumentationBuilder { /// /// ```text /// : - /// expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + /// expression to operate on. Can be a constant, column, or function, and any combination of operators. /// ``` pub fn with_standard_argument( self, diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index cf8217fe981d..efa9a6d8daad 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -31,13 +31,17 @@ use datafusion_common::ScalarValue; use datafusion_common::{ downcast_value, internal_err, not_impl_err, DataFusionError, Result, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; use std::any::Any; use std::fmt::{Debug, Formatter}; use std::hash::Hash; use std::marker::PhantomData; +use std::sync::OnceLock; make_udaf_expr_and_func!( ApproxDistinct, approx_distinct, @@ -303,4 +307,33 @@ impl AggregateUDFImpl for ApproxDistinct { }; Ok(accumulator) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_distinct_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_distinct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", + ) + .with_syntax_example("approx_distinct(expression)") + .with_sql_example(r#"```sql +> SELECT approx_distinct(column_name) FROM table_name; ++-----------------------------------+ +| approx_distinct(column_name) | ++-----------------------------------+ +| 42 | ++-----------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index 7a7b12432544..dd5bb8d441ed 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -19,15 +19,19 @@ use std::any::Any; use std::fmt::Debug; +use std::sync::OnceLock; use arrow::{datatypes::DataType, datatypes::Field}; use arrow_schema::DataType::{Float64, UInt64}; use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; use crate::approx_percentile_cont::ApproxPercentileAccumulator; @@ -116,4 +120,33 @@ impl AggregateUDFImpl for ApproxMedian { acc_args.exprs[0].data_type(acc_args.schema)?, ))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_median_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_median_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.", + ) + .with_syntax_example("approx_median(expression)") + .with_sql_example(r#"```sql +> SELECT approx_median(column_name) FROM table_name; ++-----------------------------------+ +| approx_median(column_name) | ++-----------------------------------+ +| 23.5 | ++-----------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 5578aebbf403..b4488d6d9e61 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{Array, RecordBatch}; use arrow::compute::{filter, is_not_null}; @@ -34,12 +34,13 @@ use datafusion_common::{ downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, ColumnarValue, Expr, Signature, TypeSignature, - Volatility, + Accumulator, AggregateUDFImpl, ColumnarValue, Documentation, Expr, Signature, + TypeSignature, Volatility, }; use datafusion_functions_aggregate_common::tdigest::{ TDigest, TryIntoF64, DEFAULT_MAX_SIZE, @@ -268,6 +269,36 @@ impl AggregateUDFImpl for ApproxPercentileCont { } Ok(arg_types[0].clone()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_percentile_cont_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_percentile_cont_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the approximate percentile of input values using the t-digest algorithm.", + ) + .with_syntax_example("approx_percentile_cont(expression, percentile, centroids)") + .with_sql_example(r#"```sql +> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; ++-------------------------------------------------+ +| approx_percentile_cont(column_name, 0.75, 100) | ++-------------------------------------------------+ +| 65.0 | ++-------------------------------------------------+ +```"#) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).") + .with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index fee67ba1623d..8cbf9587a75a 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::{ array::ArrayRef, @@ -26,10 +26,13 @@ use arrow::{ use datafusion_common::ScalarValue; use datafusion_common::{not_impl_err, plan_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::Volatility::Immutable; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, TypeSignature}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, +}; use datafusion_functions_aggregate_common::tdigest::{ Centroid, TDigest, DEFAULT_MAX_SIZE, }; @@ -151,6 +154,37 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight { fn state_fields(&self, args: StateFieldsArgs) -> Result> { self.approx_percentile_cont.state_fields(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_approx_percentile_cont_with_weight_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_approx_percentile_cont_with_weight_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_APPROXIMATE) + .with_description( + "Returns the weighted approximate percentile of input values using the t-digest algorithm.", + ) + .with_syntax_example("approx_percentile_cont_with_weight(expression, weight, percentile)") + .with_sql_example(r#"```sql +> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; ++----------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | ++----------------------------------------------------------------------+ +| 78.5 | ++----------------------------------------------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("weight", "Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 15146fc4a2d8..b44c4e6874ef 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -25,15 +25,16 @@ use datafusion_common::cast::as_list_array; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{internal_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; -use datafusion_expr::AggregateUDFImpl; use datafusion_expr::{Accumulator, Signature, Volatility}; +use datafusion_expr::{AggregateUDFImpl, Documentation}; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use std::collections::{HashSet, VecDeque}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udaf_expr_and_func!( ArrayAgg, @@ -142,6 +143,35 @@ impl AggregateUDFImpl for ArrayAgg { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(array_agg_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_array_agg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_array_agg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns an array created from the expression elements. If ordering is required, elements are inserted in the specified order.", + ) + .with_syntax_example("array_agg(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| array_agg(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| [element1, element2, element3] | ++-----------------------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index ddad76a8734b..ad58eecdf949 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -28,12 +28,14 @@ use arrow::datatypes::{ Float64Type, UInt64Type, }; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{avg_return_type, coerce_avg_type}; use datafusion_expr::utils::format_state_name; use datafusion_expr::Volatility::Immutable; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, EmitTo, GroupsAccumulator, ReversedUDAF, Signature, + Accumulator, AggregateUDFImpl, Documentation, EmitTo, GroupsAccumulator, + ReversedUDAF, Signature, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::accumulate::NullState; @@ -45,7 +47,7 @@ use datafusion_functions_aggregate_common::utils::DecimalAverager; use log::debug; use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udaf_expr_and_func!( Avg, @@ -235,6 +237,36 @@ impl AggregateUDFImpl for Avg { } coerce_avg_type(self.name(), arg_types) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_avg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_avg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the average of numeric values in the specified column.", + ) + .with_syntax_example("avg(expression)") + .with_sql_example(r#"```sql +> SELECT avg(column_name) FROM table_name; ++---------------------------+ +| avg(column_name) | ++---------------------------+ +| 42.75 | ++---------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("Aliases: ", "`mean`") + .build() + .unwrap() + }) } /// An accumulator to compute the average diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs b/datafusion/functions-aggregate/src/bool_and_or.rs index 7cc7d9ff7fec..e212ba8d6172 100644 --- a/datafusion/functions-aggregate/src/bool_and_or.rs +++ b/datafusion/functions-aggregate/src/bool_and_or.rs @@ -18,6 +18,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution use std::any::Any; +use std::sync::OnceLock; use arrow::array::ArrayRef; use arrow::array::BooleanArray; @@ -29,10 +30,12 @@ use arrow::datatypes::Field; use datafusion_common::internal_err; use datafusion_common::{downcast_value, not_impl_err}; use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::bool_op::BooleanGroupsAccumulator; @@ -172,6 +175,34 @@ impl AggregateUDFImpl for BoolAnd { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bool_and_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_bool_and_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns true if all non-null input values are true, otherwise false.", + ) + .with_syntax_example("bool_and(expression)") + .with_sql_example(r#"```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug, Default)] @@ -293,6 +324,32 @@ impl AggregateUDFImpl for BoolOr { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bool_or_doc()) + } +} + +fn get_bool_or_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns true if any non-null input value is true, otherwise false.", + ) + .with_syntax_example("bool_or(expression)") + .with_sql_example(r#"```sql +> SELECT bool_or(column_name) FROM table_name; ++----------------------------+ +| bool_or(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#) + .with_standard_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug, Default)] diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs index 88f01b06d2d9..60be3608e99e 100644 --- a/datafusion/functions-aggregate/src/correlation.rs +++ b/datafusion/functions-aggregate/src/correlation.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::compute::{and, filter, is_not_null}; use arrow::{ @@ -30,11 +30,12 @@ use arrow::{ use crate::covariance::CovarianceAccumulator; use crate::stddev::StddevAccumulator; use datafusion_common::{plan_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, utils::format_state_name, - Accumulator, AggregateUDFImpl, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -107,6 +108,35 @@ impl AggregateUDFImpl for Correlation { ), ]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_corr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_corr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the coefficient of correlation between two numeric values.", + ) + .with_syntax_example("corr(expression1, expression2)") + .with_sql_example(r#"```sql +> SELECT corr(column1, column2) FROM table_name; ++--------------------------------+ +| corr(column1, column2) | ++--------------------------------+ +| 0.85 | ++--------------------------------+ +```"#) + .with_argument("expression1", "First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("expression2", "Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } /// An accumulator to compute correlation diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index cc245b3572ec..23dd5b65bf82 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -20,8 +20,9 @@ use datafusion_common::stats::Precision; use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator; use datafusion_physical_expr::expressions; use std::collections::HashSet; +use std::fmt::Debug; use std::ops::BitAnd; -use std::{fmt::Debug, sync::Arc}; +use std::sync::{Arc, OnceLock}; use arrow::{ array::{ArrayRef, AsArray}, @@ -43,10 +44,11 @@ use arrow::{ use datafusion_common::{ downcast_value, internal_err, not_impl_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, - EmitTo, GroupsAccumulator, Signature, Volatility, + Documentation, EmitTo, GroupsAccumulator, Signature, Volatility, }; use datafusion_expr::{Expr, ReversedUDAF, StatisticsArgs, TypeSignature}; use datafusion_functions_aggregate_common::aggregate::count_distinct::{ @@ -324,6 +326,41 @@ impl AggregateUDFImpl for Count { } None } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_count_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_count_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`.", + ) + .with_syntax_example("count(expression)") + .with_sql_example(r#"```sql +> SELECT count(column_name) FROM table_name; ++-----------------------+ +| count(column_name) | ++-----------------------+ +| 100 | ++-----------------------+ + +> SELECT count(*) FROM table_name; ++------------------+ +| count(*) | ++------------------+ +| 120 | ++------------------+ +```"#) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/covariance.rs b/datafusion/functions-aggregate/src/covariance.rs index d0abb079ef15..c599b58ed207 100644 --- a/datafusion/functions-aggregate/src/covariance.rs +++ b/datafusion/functions-aggregate/src/covariance.rs @@ -18,6 +18,7 @@ //! [`CovarianceSample`]: covariance sample aggregations. use std::fmt::Debug; +use std::sync::OnceLock; use arrow::{ array::{ArrayRef, Float64Array, UInt64Array}, @@ -29,11 +30,12 @@ use datafusion_common::{ downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, utils::format_state_name, - Accumulator, AggregateUDFImpl, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -124,6 +126,36 @@ impl AggregateUDFImpl for CovarianceSample { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_covar_samp_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_covar_samp_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the sample covariance of a set of number pairs.", + ) + .with_syntax_example("covar_samp(expression1, expression2)") + .with_sql_example(r#"```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +```"#, + ) + .with_argument("expression1", "First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("expression2", "Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } pub struct CovariancePopulation { @@ -193,6 +225,34 @@ impl AggregateUDFImpl for CovariancePopulation { StatsType::Population, )?)) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_covar_pop_doc()) + } +} + +fn get_covar_pop_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the population covariance of a set of number pairs.", + ) + .with_syntax_example("covar_pop(expression1, expression2)") + .with_sql_example(r#"```sql +> SELECT covar_pop(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_pop(column1, column2) | ++-----------------------------------+ +| 7.63 | ++-----------------------------------+ +```"#, + ) + .with_argument("expression1", "First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("expression2", "Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } /// An accumulator to compute covariance diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index 41ac7875795d..02b8c522823e 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::compute::{self, lexsort_to_indices, SortColumn}; @@ -28,11 +28,12 @@ use datafusion_common::utils::{compare_rows, get_row_at_idx, take_arrays}; use datafusion_common::{ arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Expr, ExprFunctionExt, - Signature, SortExpr, TypeSignature, Volatility, + Accumulator, AggregateUDFImpl, ArrayFunctionSignature, Documentation, Expr, + ExprFunctionExt, Signature, SortExpr, TypeSignature, Volatility, }; use datafusion_functions_aggregate_common::utils::get_sort_options; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -165,6 +166,35 @@ impl AggregateUDFImpl for FirstValue { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(last_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_first_value_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_first_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", + ) + .with_syntax_example("first_value(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug)] @@ -466,6 +496,33 @@ impl AggregateUDFImpl for LastValue { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Reversed(first_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_last_value_doc()) + } +} + +fn get_last_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", + ) + .with_syntax_example("last_value(expression [ORDER BY expression])") + .with_sql_example(r#"```sql +> SELECT last_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| last_value(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| last_element | ++-----------------------------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/grouping.rs b/datafusion/functions-aggregate/src/grouping.rs index 6fb7c3800f4e..09e9b90b2e6d 100644 --- a/datafusion/functions-aggregate/src/grouping.rs +++ b/datafusion/functions-aggregate/src/grouping.rs @@ -19,14 +19,18 @@ use std::any::Any; use std::fmt; +use std::sync::OnceLock; use arrow::datatypes::DataType; use arrow::datatypes::Field; use datafusion_common::{not_impl_err, Result}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; -use datafusion_expr::{Accumulator, AggregateUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; make_udaf_expr_and_func!( Grouping, @@ -94,4 +98,37 @@ impl AggregateUDFImpl for Grouping { "physical plan is not yet implemented for GROUPING aggregate function" ) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_grouping_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_grouping_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set.", + ) + .with_syntax_example("grouping(expression)") + .with_sql_example(r#"```sql +> SELECT column_name, GROUPING(column_name) AS group_column + FROM table_name + GROUP BY GROUPING SETS ((column_name), ()); ++-------------+-------------+ +| column_name | group_column | ++-------------+-------------+ +| value1 | 0 | +| value2 | 0 | +| NULL | 1 | ++-------------+-------------+ +```"#, + ) + .with_argument("expression", "Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index 7dd0de14c3c0..8eb17db1eca9 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -16,8 +16,8 @@ // under the License. use std::collections::HashSet; -use std::fmt::Formatter; -use std::{fmt::Debug, sync::Arc}; +use std::fmt::{Debug, Formatter}; +use std::sync::{Arc, OnceLock}; use arrow::array::{downcast_integer, ArrowNumericType}; use arrow::{ @@ -33,10 +33,11 @@ use arrow::array::ArrowNativeTypeOp; use arrow::datatypes::ArrowNativeType; use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, - Signature, Volatility, + Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::utils::Hashable; @@ -152,6 +153,35 @@ impl AggregateUDFImpl for Median { fn aliases(&self) -> &[String] { &[] } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_median_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_median_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the median value in the specified column.", + ) + .with_syntax_example("median(expression)") + .with_sql_example(r#"```sql +> SELECT median(column_name) FROM table_name; ++----------------------+ +| median(column_name) | ++----------------------+ +| 45.5 | ++----------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } /// The median accumulator accumulates the raw input values diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 26ba97f505fd..2f7954a8ee02 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -40,6 +40,7 @@ use datafusion_common::stats::Precision; use datafusion_common::{ downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError, Result, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_physical_expr::expressions; use std::fmt::Debug; @@ -54,11 +55,13 @@ use arrow::datatypes::{ use crate::min_max::min_max_bytes::MinMaxBytesAccumulator; use datafusion_common::ScalarValue; use datafusion_expr::{ - function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Signature, Volatility, + function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Documentation, Signature, + Volatility, }; use datafusion_expr::{GroupsAccumulator, StatisticsArgs}; use half::f16; use std::ops::Deref; +use std::sync::OnceLock; fn get_min_max_result_type(input_types: &[DataType]) -> Result> { // make sure that the input types only has one element. @@ -330,6 +333,35 @@ impl AggregateUDFImpl for Max { fn value_from_stats(&self, statistics_args: &StatisticsArgs) -> Option { self.value_from_statistics(statistics_args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_max_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_max_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the maximum value in the specified column.", + ) + .with_syntax_example("max(expression)") + .with_sql_example(r#"```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } // Statically-typed version of min/max(array) -> ScalarValue for string types @@ -1134,7 +1166,35 @@ impl AggregateUDFImpl for Min { fn reverse_expr(&self) -> datafusion_expr::ReversedUDAF { datafusion_expr::ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_min_doc()) + } +} + +fn get_min_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the minimum value in the specified column.", + ) + .with_syntax_example("min(expression)") + .with_sql_example(r#"```sql +> SELECT min(column_name) FROM table_name; ++----------------------+ +| min(column_name) | ++----------------------+ +| 12 | ++----------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } + /// An accumulator to compute the minimum value #[derive(Debug)] pub struct MinAccumulator { diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index bbfe56914c91..6d8cea8f0531 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -20,18 +20,19 @@ use std::any::Any; use std::collections::VecDeque; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{new_empty_array, ArrayRef, AsArray, StructArray}; use arrow_schema::{DataType, Field, Fields}; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - lit, Accumulator, AggregateUDFImpl, ExprFunctionExt, ReversedUDAF, Signature, - SortExpr, Volatility, + lit, Accumulator, AggregateUDFImpl, Documentation, ExprFunctionExt, ReversedUDAF, + Signature, SortExpr, Volatility, }; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; @@ -161,6 +162,40 @@ impl AggregateUDFImpl for NthValueAgg { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Reversed(nth_value_udaf()) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nth_value_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nth_value_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the nth value in a group of values.", + ) + .with_syntax_example("nth_value(expression, n ORDER BY expression)") + .with_sql_example(r#"```sql +> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept + FROM employee; ++---------+--------+-------------------------+ +| dept_id | salary | second_salary_by_dept | ++---------+--------+-------------------------+ +| 1 | 30000 | NULL | +| 1 | 40000 | 40000 | +| 1 | 50000 | 40000 | +| 2 | 35000 | NULL | +| 2 | 45000 | 45000 | ++---------+--------+-------------------------+ +```"#) + .with_standard_argument("expression", "The column or expression to retrieve the nth value from.") + .with_argument("n", "The position (nth) of the value to retrieve, based on the ordering.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs index a25ab5e31991..9f9da0c585fc 100644 --- a/datafusion/functions-aggregate/src/stddev.rs +++ b/datafusion/functions-aggregate/src/stddev.rs @@ -19,17 +19,19 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::Float64Array; use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_common::{plan_err, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature, + Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; @@ -132,6 +134,35 @@ impl AggregateUDFImpl for Stddev { ) -> Result> { Ok(Box::new(StddevGroupsAccumulator::new(StatsType::Sample))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_stddev_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_stddev_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the standard deviation of a set of numbers.", + ) + .with_syntax_example("stddev(expression)") + .with_sql_example(r#"```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } make_udaf_expr_and_func!( @@ -228,6 +259,33 @@ impl AggregateUDFImpl for StddevPop { StatsType::Population, ))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_stddev_pop_doc()) + } +} + +fn get_stddev_pop_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STATISTICAL) + .with_description( + "Returns the population standard deviation of a set of numbers.", + ) + .with_syntax_example("stddev_pop(expression)") + .with_sql_example(r#"```sql +> SELECT stddev_pop(column_name) FROM table_name; ++--------------------------+ +| stddev_pop(column_name) | ++--------------------------+ +| 10.56 | ++--------------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } /// An accumulator to compute the average diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs index a7e9a37e23ad..66fc19910696 100644 --- a/datafusion/functions-aggregate/src/string_agg.rs +++ b/datafusion/functions-aggregate/src/string_agg.rs @@ -22,12 +22,14 @@ use arrow_schema::DataType; use datafusion_common::cast::as_generic_string_array; use datafusion_common::Result; use datafusion_common::{not_impl_err, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, Signature, TypeSignature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, Volatility, }; use datafusion_physical_expr::expressions::Literal; use std::any::Any; +use std::sync::OnceLock; make_udaf_expr_and_func!( StringAgg, @@ -98,6 +100,37 @@ impl AggregateUDFImpl for StringAgg { not_impl_err!("expect literal") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_string_agg_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_string_agg_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Concatenates the values of string expressions and places separator values between them." + ) + .with_syntax_example("string_agg(expression, delimiter)") + .with_sql_example(r#"```sql +> SELECT string_agg(name, ', ') AS names_list + FROM employee; ++--------------------------+ +| names_list | ++--------------------------+ +| Alice, Bob, Charlie | ++--------------------------+ +```"#, + ) + .with_argument("expression", "The string expression to concatenate. Can be a column or any valid string expression.") + .with_argument("delimiter", "A literal string used as a separator between the concatenated values.") + .build() + .unwrap() + }) } #[derive(Debug)] diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index 7e40c1bd17a8..91e777dd2a87 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -21,6 +21,7 @@ use ahash::RandomState; use datafusion_expr::utils::AggregateOrderSensitivity; use std::any::Any; use std::collections::HashSet; +use std::sync::OnceLock; use arrow::array::Array; use arrow::array::ArrowNativeTypeOp; @@ -33,11 +34,13 @@ use arrow::datatypes::{ }; use arrow::{array::ArrayRef, datatypes::Field}; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_functions_aggregate_common::utils::Hashable; @@ -233,6 +236,35 @@ impl AggregateUDFImpl for Sum { fn order_sensitivity(&self) -> AggregateOrderSensitivity { AggregateOrderSensitivity::Insensitive } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_sum_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_sum_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the sum of all values in the specified column.", + ) + .with_syntax_example("sum(expression)") + .with_sql_example(r#"```sql +> SELECT sum(column_name) FROM table_name; ++-----------------------+ +| sum(column_name) | ++-----------------------+ +| 12345 | ++-----------------------+ +```"#, + ) + .with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) } /// This accumulator computes SUM incrementally diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index a3d114221d3f..2ffe93a0e567 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -117,6 +117,11 @@ harness = false name = "make_date" required-features = ["datetime_expressions"] +[[bench]] +harness = false +name = "iszero" +required-features = ["math_expressions"] + [[bench]] harness = false name = "nullif" diff --git a/datafusion/functions/benches/iszero.rs b/datafusion/functions/benches/iszero.rs new file mode 100644 index 000000000000..3348d172e1f2 --- /dev/null +++ b/datafusion/functions/benches/iszero.rs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::{ + datatypes::{Float32Type, Float64Type}, + util::bench_util::create_primitive_array, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::math::iszero; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let iszero = iszero(); + for size in [1024, 4096, 8192] { + let f32_array = Arc::new(create_primitive_array::(size, 0.2)); + let f32_args = vec![ColumnarValue::Array(f32_array)]; + c.bench_function(&format!("iszero f32 array: {}", size), |b| { + b.iter(|| black_box(iszero.invoke(&f32_args).unwrap())) + }); + let f64_array = Arc::new(create_primitive_array::(size, 0.2)); + let f64_args = vec![ColumnarValue::Array(f64_array)]; + c.bench_function(&format!("iszero f64 array: {}", size), |b| { + b.iter(|| black_box(iszero.invoke(&f64_args).unwrap())) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index a1b74228a503..a3e3feaa17e3 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -17,17 +17,19 @@ //! [`ArrowCastFunc`]: Implementation of the `arrow_cast` -use std::any::Any; - use arrow::datatypes::DataType; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, plan_err, DataFusionError, ExprSchema, Result, ScalarValue, }; +use std::any::Any; +use std::sync::OnceLock; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ - ColumnarValue, Expr, ExprSchemable, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, Expr, ExprSchemable, ScalarUDFImpl, Signature, + Volatility, }; /// Implements casting to arbitrary arrow types (rather than SQL types) @@ -131,6 +133,39 @@ impl ScalarUDFImpl for ArrowCastFunc { // return the newly written argument to DataFusion Ok(ExprSimplifyResult::Simplified(new_expr)) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_arrow_cast_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_arrow_cast_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Casts a value to a specific Arrow data type.") + .with_syntax_example("arrow_cast(expression, datatype)") + .with_sql_example( + r#"```sql +> select arrow_cast(-5, 'Int8') as a, + arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, + arrow_cast('bar', 'LargeUtf8') as c, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d + ; ++----+-----+-----+---------------------------+ +| a | b | c | d | ++----+-----+-----+---------------------------+ +| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | ++----+-----+-----+---------------------------+ +```"#, + ) + .with_argument("expression", "Expression to cast. The expression can be a constant, column, or function, and any combination of operators.") + .with_argument("datatype", "[Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`]") + .build() + .unwrap() + }) } /// Returns the requested type from the arguments diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs index cc5e7e619bd8..a425aff6caad 100644 --- a/datafusion/functions/src/core/arrowtypeof.rs +++ b/datafusion/functions/src/core/arrowtypeof.rs @@ -17,9 +17,11 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct ArrowTypeOfFunc { @@ -69,4 +71,35 @@ impl ScalarUDFImpl for ArrowTypeOfFunc { "{input_data_type}" )))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_arrowtypeof_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_arrowtypeof_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.", + ) + .with_syntax_example("arrow_typeof(expression)") + .with_sql_example( + r#"```sql +> select arrow_typeof('foo'), arrow_typeof(1); ++---------------------------+------------------------+ +| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | ++---------------------------+------------------------+ +| Utf8 | Int64 | ++---------------------------+------------------------+ +``` +"#, + ) + .with_argument("expression", "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index d8ff44798f8a..15cd733a8cd6 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -47,23 +47,6 @@ impl CoalesceFunc { } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_coalesce_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder() - .with_doc_section(DOC_SECTION_CONDITIONAL) - .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") - .with_syntax_example("coalesce(expression1[, ..., expression_n])") - .with_argument( - "expression1, expression_n", - "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." - ) - .build() - .unwrap() - }) -} - impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { self @@ -164,6 +147,32 @@ impl ScalarUDFImpl for CoalesceFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_coalesce_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") + .with_syntax_example("coalesce(expression1[, ..., expression_n])") + .with_sql_example(r#"```sql +> select coalesce(null, null, 'datafusion'); ++----------------------------------------+ +| coalesce(NULL,NULL,Utf8("datafusion")) | ++----------------------------------------+ +| datafusion | ++----------------------------------------+ +```"#, + ) + .with_argument( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." + ) + .build() + .unwrap() + }) +} + #[cfg(test)] mod test { use arrow::datatypes::DataType; diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index a51f895c5084..c0af4d35966b 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -23,10 +23,11 @@ use datafusion_common::cast::{as_map_array, as_struct_array}; use datafusion_common::{ exec_err, plan_datafusion_err, plan_err, ExprSchema, Result, ScalarValue, }; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct GetFieldFunc { @@ -133,7 +134,7 @@ impl ScalarUDFImpl for GetFieldFunc { DataType::Struct(fields) if fields.len() == 2 => { // Arrow's MapArray is essentially a ListArray of structs with two columns. They are // often named "key", and "value", but we don't require any specific naming here; - // instead, we assume that the second columnis the "value" column both here and in + // instead, we assume that the second column is the "value" column both here and in // execution. let value_field = fields.get(1).expect("fields should have exactly two members"); Ok(value_field.data_type().clone()) @@ -155,7 +156,7 @@ impl ScalarUDFImpl for GetFieldFunc { "Only UTF8 strings are valid as an indexed field in a struct" ), (DataType::Null, _) => Ok(DataType::Null), - (other, _) => plan_err!("The expression to get an indexed field is only valid for `List`, `Struct`, `Map` or `Null` types, got {other}"), + (other, _) => plan_err!("The expression to get an indexed field is only valid for `Struct`, `Map` or `Null` types, got {other}"), } } @@ -190,7 +191,7 @@ impl ScalarUDFImpl for GetFieldFunc { let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?; // note that this array has more entries than the expected output/input size - // because maparray is flatten + // because map_array is flattened let original_data = map_array.entries().column(1).to_data(); let capacity = Capacities::Array(original_data.len()); let mut mutable = @@ -205,7 +206,7 @@ impl ScalarUDFImpl for GetFieldFunc { keys.slice(start, end-start). iter().enumerate(). find(|(_, t)| t.unwrap()); - if maybe_matched.is_none(){ + if maybe_matched.is_none() { mutable.extend_nulls(1); continue } @@ -224,14 +225,67 @@ impl ScalarUDFImpl for GetFieldFunc { } } (DataType::Struct(_), name) => exec_err!( - "get indexed field is only possible on struct with utf8 indexes. \ - Tried with {name:?} index" + "get_field is only possible on struct with utf8 indexes. \ + Received with {name:?} index" ), (DataType::Null, _) => Ok(ColumnarValue::Scalar(ScalarValue::Null)), (dt, name) => exec_err!( - "get indexed field is only possible on lists with int64 indexes or struct \ - with utf8 indexes. Tried {dt:?} with {name:?} index" + "get_field is only possible on maps with utf8 indexes or struct \ + with utf8 indexes. Received {dt:?} with {name:?} index" ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_getfield_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_getfield_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description(r#"Returns a field within a map or a struct with the given key. +Note: most users invoke `get_field` indirectly via field access +syntax such as `my_struct_col['field_name']` which results in a call to +`get_field(my_struct_col, 'field_name')`."#) + .with_syntax_example("get_field(expression1, expression2)") + .with_sql_example(r#"```sql +> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow'); +> select struct(idx, v) from t as c; ++-------------------------+ +| struct(c.idx,c.v) | ++-------------------------+ +| {c0: data, c1: fusion} | +| {c0: apache, c1: arrow} | ++-------------------------+ +> select get_field((select struct(idx, v) from t), 'c0'); ++-----------------------+ +| struct(t.idx,t.v)[c0] | ++-----------------------+ +| data | +| apache | ++-----------------------+ +> select get_field((select struct(idx, v) from t), 'c1'); ++-----------------------+ +| struct(t.idx,t.v)[c1] | ++-----------------------+ +| fusion | +| arrow | ++-----------------------+ +``` + "#) + .with_argument( + "expression1", + "The map or struct to retrieve a field for." + ) + .with_argument( + "expression2", + "The field name in the map or struct to retrieve data for. Must evaluate to a string." + ) + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs index 85c332745355..342f99274aca 100644 --- a/datafusion/functions/src/core/named_struct.rs +++ b/datafusion/functions/src/core/named_struct.rs @@ -18,11 +18,12 @@ use arrow::array::StructArray; use arrow::datatypes::{DataType, Field, Fields}; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use hashbrown::HashSet; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; /// put values in a struct array. fn named_struct_expr(args: &[ColumnarValue]) -> Result { @@ -161,4 +162,46 @@ impl ScalarUDFImpl for NamedStructFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { named_struct_expr(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_named_struct_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_named_struct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRUCT) + .with_description("Returns an Arrow struct using the specified name and input expressions pairs.") + .with_syntax_example("named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input])") + .with_sql_example(r#" +For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `field_b`: +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ +> select named_struct('field_a', a, 'field_b', b) from t; ++-------------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | ++-------------------------------------------------------+ +| {field_a: 1, field_b: 2} | +| {field_a: 3, field_b: 4} | ++-------------------------------------------------------+ +``` +"#) + .with_argument( + "expression_n_name", + "Name of the column field. Must be a constant string." + ) + .with_argument("expression_n_input", "Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 6fcfbd36416e..f96ee1ea7a12 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -17,13 +17,15 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Documentation}; use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::nullif::nullif; use datafusion_common::ScalarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct NullIfFunc { @@ -93,6 +95,47 @@ impl ScalarUDFImpl for NullIfFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { nullif_func(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nullif_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nullif_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. +This can be used to perform the inverse operation of [`coalesce`](#coalesce).") + .with_syntax_example("nullif(expression1, expression2)") + .with_sql_example(r#"```sql +> select nullif('datafusion', 'data'); ++-----------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("data")) | ++-----------------------------------------+ +| datafusion | ++-----------------------------------------+ +> select nullif('datafusion', 'datafusion'); ++-----------------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("datafusion")) | ++-----------------------------------------------+ +| | ++-----------------------------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to compare and return if equal to expression2. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to compare to expression1. Can be a constant, column, or function, and any combination of operators." + ) + .build() + .unwrap() + }) } /// Implements NULLIF(expr1, expr2) diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index a09224acefcd..16438e1b6254 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -20,8 +20,11 @@ use arrow::compute::is_not_null; use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{internal_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use std::sync::Arc; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct NVLFunc { @@ -91,6 +94,46 @@ impl ScalarUDFImpl for NVLFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nvl_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nvl_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_.") + .with_syntax_example("nvl(expression1, expression2)") + .with_sql_example(r#"```sql +> select nvl(null, 'a'); ++---------------------+ +| nvl(NULL,Utf8("a")) | ++---------------------+ +| a | ++---------------------+\ +> select nvl('b', 'a'); ++--------------------------+ +| nvl(Utf8("b"),Utf8("a")) | ++--------------------------+ +| b | ++--------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to return if not null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators." + ) + .build() + .unwrap() + }) } fn nvl_func(args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 1144dc0fb7c5..cfcdb4480787 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -20,11 +20,12 @@ use arrow::compute::is_not_null; use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{exec_err, internal_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::{ - type_coercion::binary::comparison_coercion, ColumnarValue, ScalarUDFImpl, Signature, - Volatility, + type_coercion::binary::comparison_coercion, ColumnarValue, Documentation, + ScalarUDFImpl, Signature, Volatility, }; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct NVL2Func { @@ -90,6 +91,50 @@ impl ScalarUDFImpl for NVL2Func { )?; Ok(vec![new_type; arg_types.len()]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_nvl2_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_nvl2_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_.") + .with_syntax_example("nvl2(expression1, expression2, expression3)") + .with_sql_example(r#"```sql +> select nvl2(null, 'a', 'b'); ++--------------------------------+ +| nvl2(NULL,Utf8("a"),Utf8("b")) | ++--------------------------------+ +| b | ++--------------------------------+ +> select nvl2('data', 'a', 'b'); ++----------------------------------------+ +| nvl2(Utf8("data"),Utf8("a"),Utf8("b")) | ++----------------------------------------+ +| a | ++----------------------------------------+ +``` +"#) + .with_argument( + "expression1", + "Expression to test for null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression2", + "Expression to return if expr1 is not null. Can be a constant, column, or function, and any combination of operators." + ) + .with_argument( + "expression3", + "Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators." + ) + .build() + .unwrap() + }) } fn nvl2_func(args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/core/struct.rs b/datafusion/functions/src/core/struct.rs index 5eea7dd48de8..75d1d4eca698 100644 --- a/datafusion/functions/src/core/struct.rs +++ b/datafusion/functions/src/core/struct.rs @@ -18,10 +18,11 @@ use arrow::array::{ArrayRef, StructArray}; use arrow::datatypes::{DataType, Field, Fields}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRUCT; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; fn array_struct(args: &[ArrayRef]) -> Result { // do not accept 0 arguments. @@ -103,4 +104,56 @@ impl ScalarUDFImpl for StructFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { struct_expr(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_struct_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_struct_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRUCT) + .with_description("Returns an Arrow struct using the specified input expressions optionally named. +Fields in the returned struct use the optional name or the `cN` naming convention. +For example: `c0`, `c1`, `c2`, etc.") + .with_syntax_example("struct(expression1[, ..., expression_n])") + .with_sql_example(r#"For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `c1`: +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ + +-- use default names `c0`, `c1` +> select struct(a, b) from t; ++-----------------+ +| struct(t.a,t.b) | ++-----------------+ +| {c0: 1, c1: 2} | +| {c0: 3, c1: 4} | ++-----------------+ + +-- name the first field `field_a` +select struct(a as field_a, b) from t; ++--------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) | ++--------------------------------------------------+ +| {field_a: 1, c1: 2} | +| {field_a: 3, c1: 4} | ++--------------------------------------------------+ +``` +"#) + .with_argument( + "expression1, expression_n", + "Expression to include in the output struct. Can be a constant, column, or function, any combination of arithmetic or string operators.") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/core/version.rs b/datafusion/functions/src/core/version.rs index 212349e68981..f726122c649a 100644 --- a/datafusion/functions/src/core/version.rs +++ b/datafusion/functions/src/core/version.rs @@ -17,11 +17,14 @@ //! [`VersionFunc`]: Implementation of the `version` function. -use std::any::Any; - use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, plan_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct VersionFunc { @@ -78,6 +81,33 @@ impl ScalarUDFImpl for VersionFunc { ); Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(version)))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_version_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_version_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns the version of DataFusion.") + .with_syntax_example("version()") + .with_sql_example( + r#"```sql +> select version(); ++--------------------------------------------+ +| version() | ++--------------------------------------------+ +| Apache DataFusion 42.0.0, aarch64 on macos | ++--------------------------------------------+ +```"#, + ) + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/math/iszero.rs b/datafusion/functions/src/math/iszero.rs index e6a728053359..74611b65aaba 100644 --- a/datafusion/functions/src/math/iszero.rs +++ b/datafusion/functions/src/math/iszero.rs @@ -18,11 +18,11 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array}; -use arrow::datatypes::DataType; +use arrow::array::{ArrayRef, AsArray, BooleanArray}; use arrow::datatypes::DataType::{Boolean, Float32, Float64}; +use arrow::datatypes::{DataType, Float32Type, Float64Type}; -use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -77,20 +77,14 @@ impl ScalarUDFImpl for IsZeroFunc { /// Iszero SQL function pub fn iszero(args: &[ArrayRef]) -> Result { match args[0].data_type() { - Float64 => Ok(Arc::new(make_function_scalar_inputs_return_type!( - &args[0], - "x", - Float64Array, - BooleanArray, - { |x: f64| { x == 0_f64 } } + Float64 => Ok(Arc::new(BooleanArray::from_unary( + args[0].as_primitive::(), + |x| x == 0.0, )) as ArrayRef), - Float32 => Ok(Arc::new(make_function_scalar_inputs_return_type!( - &args[0], - "x", - Float32Array, - BooleanArray, - { |x: f32| { x == 0_f32 } } + Float32 => Ok(Arc::new(BooleanArray::from_unary( + args[0].as_primitive::(), + |x| x == 0.0, )) as ArrayRef), other => exec_err!("Unsupported data type {other:?} for function iszero"), diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 029003374acc..6068e7526316 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -780,7 +780,7 @@ mod tests { }; sort_info.push(sort_expr); } - exec = exec.with_sort_information(vec![sort_info]); + exec = exec.try_with_sort_information(vec![sort_info]).unwrap(); } Arc::new(exec) diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs index 264f297ffb4c..090d60f0bac3 100644 --- a/datafusion/physical-plan/src/joins/test_utils.rs +++ b/datafusion/physical-plan/src/joins/test_utils.rs @@ -289,7 +289,7 @@ macro_rules! join_expr_tests { ScalarValue::$SCALAR(Some(10 as $type)), (Operator::Gt, Operator::Lt), ), - // left_col - 1 > right_col + 5 AND left_col + 3 < right_col + 10 + // left_col - 1 > right_col + 3 AND left_col + 3 < right_col + 15 1 => gen_conjunctive_numerical_expr( left_col, right_col, @@ -300,9 +300,9 @@ macro_rules! join_expr_tests { Operator::Plus, ), ScalarValue::$SCALAR(Some(1 as $type)), - ScalarValue::$SCALAR(Some(5 as $type)), ScalarValue::$SCALAR(Some(3 as $type)), - ScalarValue::$SCALAR(Some(10 as $type)), + ScalarValue::$SCALAR(Some(3 as $type)), + ScalarValue::$SCALAR(Some(15 as $type)), (Operator::Gt, Operator::Lt), ), // left_col - 1 > right_col + 5 AND left_col - 3 < right_col + 10 @@ -353,7 +353,8 @@ macro_rules! join_expr_tests { ScalarValue::$SCALAR(Some(3 as $type)), (Operator::Gt, Operator::Lt), ), - // left_col - 2 >= right_col - 5 AND left_col - 7 <= right_col - 3 + // left_col - 2 >= right_col + 5 AND left_col + 7 <= right_col - 3 + // (filters all input rows) 5 => gen_conjunctive_numerical_expr( left_col, right_col, @@ -369,7 +370,7 @@ macro_rules! join_expr_tests { ScalarValue::$SCALAR(Some(3 as $type)), (Operator::GtEq, Operator::LtEq), ), - // left_col - 28 >= right_col - 11 AND left_col - 21 <= right_col - 39 + // left_col + 28 >= right_col - 11 AND left_col + 21 <= right_col + 39 6 => gen_conjunctive_numerical_expr( left_col, right_col, @@ -385,7 +386,7 @@ macro_rules! join_expr_tests { ScalarValue::$SCALAR(Some(39 as $type)), (Operator::Gt, Operator::LtEq), ), - // left_col - 28 >= right_col - 11 AND left_col - 21 <= right_col + 39 + // left_col + 28 >= right_col - 11 AND left_col - 21 <= right_col + 39 7 => gen_conjunctive_numerical_expr( left_col, right_col, @@ -526,10 +527,10 @@ pub fn create_memory_table( ) -> Result<(Arc, Arc)> { let left_schema = left_partition[0].schema(); let left = MemoryExec::try_new(&[left_partition], left_schema, None)? - .with_sort_information(left_sorted); + .try_with_sort_information(left_sorted)?; let right_schema = right_partition[0].schema(); let right = MemoryExec::try_new(&[right_partition], right_schema, None)? - .with_sort_information(right_sorted); + .try_with_sort_information(right_sorted)?; Ok((Arc::new(left), Arc::new(right))) } diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 3aa445d295cb..456f0ef2dcc8 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -33,6 +33,9 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{internal_err, project_schema, Result}; use datafusion_execution::memory_pool::MemoryReservation; use datafusion_execution::TaskContext; +use datafusion_physical_expr::equivalence::ProjectionMapping; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; use futures::Stream; @@ -206,16 +209,63 @@ impl MemoryExec { /// where both `a ASC` and `b DESC` can describe the table ordering. With /// [`EquivalenceProperties`], we can keep track of these equivalences /// and treat `a ASC` and `b DESC` as the same ordering requirement. - pub fn with_sort_information(mut self, sort_information: Vec) -> Self { - self.sort_information = sort_information; + /// + /// Note that if there is an internal projection, that projection will be + /// also applied to the given `sort_information`. + pub fn try_with_sort_information( + mut self, + mut sort_information: Vec, + ) -> Result { + // All sort expressions must refer to the original schema + let fields = self.schema.fields(); + let ambiguous_column = sort_information + .iter() + .flatten() + .flat_map(|expr| collect_columns(&expr.expr)) + .find(|col| { + fields + .get(col.index()) + .map(|field| field.name() != col.name()) + .unwrap_or(true) + }); + if let Some(col) = ambiguous_column { + return internal_err!( + "Column {:?} is not found in the original schema of the MemoryExec", + col + ); + } + + // If there is a projection on the source, we also need to project orderings + if let Some(projection) = &self.projection { + let base_eqp = EquivalenceProperties::new_with_orderings( + self.original_schema(), + &sort_information, + ); + let proj_exprs = projection + .iter() + .map(|idx| { + let base_schema = self.original_schema(); + let name = base_schema.field(*idx).name(); + (Arc::new(Column::new(name, *idx)) as _, name.to_string()) + }) + .collect::>(); + let projection_mapping = + ProjectionMapping::try_new(&proj_exprs, &self.original_schema())?; + sort_information = base_eqp + .project(&projection_mapping, self.schema()) + .oeq_class + .orderings; + } + self.sort_information = sort_information; // We need to update equivalence properties when updating sort information. let eq_properties = EquivalenceProperties::new_with_orderings( self.schema(), &self.sort_information, ); self.cache = self.cache.with_eq_properties(eq_properties); - self + + Ok(self) } pub fn original_schema(&self) -> SchemaRef { @@ -347,7 +397,7 @@ mod tests { let sort_information = vec![sort1.clone(), sort2.clone()]; let mem_exec = MemoryExec::try_new(&[vec![]], schema, None)? - .with_sort_information(sort_information); + .try_with_sort_information(sort_information)?; assert_eq!( mem_exec.properties().output_ordering().unwrap(), diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index d9368cf86d45..902d9f4477bc 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1677,7 +1677,8 @@ mod test { Arc::new( MemoryExec::try_new(&[vec![]], Arc::clone(schema), None) .unwrap() - .with_sort_information(vec![sort_exprs]), + .try_with_sort_information(vec![sort_exprs]) + .unwrap(), ) } } diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 1cf22060b62a..108e42e7be42 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -809,11 +809,11 @@ mod tests { .collect::>(); let child1 = Arc::new( MemoryExec::try_new(&[], Arc::clone(&schema), None)? - .with_sort_information(first_orderings), + .try_with_sort_information(first_orderings)?, ); let child2 = Arc::new( MemoryExec::try_new(&[], Arc::clone(&schema), None)? - .with_sort_information(second_orderings), + .try_with_sort_information(second_orderings)?, ); let mut union_expected_eq = EquivalenceProperties::new(Arc::clone(&schema)); diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index c029fe2a23d8..69c7745165f4 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -477,6 +477,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let filter_expr = self.sql_to_expr(predicate_expr, plan.schema(), planner_context)?; + + // Check for aggregation functions + let aggregate_exprs = + find_aggregate_exprs(std::slice::from_ref(&filter_expr)); + if !aggregate_exprs.is_empty() { + return plan_err!( + "Aggregate functions are not allowed in the WHERE clause. Consider using HAVING instead" + ); + } + let mut using_columns = HashSet::new(); expr_to_columns(&filter_expr, &mut using_columns)?; let filter_expr = normalize_col_with_schemas_and_ambiguity_check( diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 3d5c37372aca..f03c3700ab9f 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -6076,3 +6076,13 @@ ORDER BY k; ---- 1 1.8125 6.8007813 Float16 Float16 2 8.5 8.5 Float16 Float16 + +statement ok +CREATE TABLE t1(v1 int); + +# issue: https://github.com/apache/datafusion/issues/12814 +statement error DataFusion error: Error during planning: Aggregate functions are not allowed in the WHERE clause. Consider using HAVING instead +SELECT v1 FROM t1 WHERE ((count(v1) % 1) << 1) > 0; + +statement ok +DROP TABLE t1; diff --git a/docs/source/index.rst b/docs/source/index.rst index f11670d259bf..27dd58cf50f4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -36,9 +36,11 @@ Apache DataFusion DataFusion is an extensible query engine written in `Rust `_ that uses `Apache Arrow `_ as its in-memory format. -This documentation is for the `core DataFusion project `_, which contains -libraries that are used to build data-centric system software. DataFusion also offers the following subprojects, which -provide packaged versions of DataFusion intended for end users, and these have separate documentation. +The documentation on this site is for the `core DataFusion project `_, which contains +libraries and binaries for developers building fast and feature rich database and analytic systems, +customized to particular workloads. See `use cases `_ for examples. + +The following related subprojects target end users and have separate documentation. - `DataFusion Python `_ offers a Python interface for SQL and DataFrame queries. @@ -47,10 +49,6 @@ provide packaged versions of DataFusion intended for end users, and these have s - `DataFusion Comet `_ is an accelerator for Apache Spark based on DataFusion. -DataFusion's target users are -developers building fast and feature rich database and analytic systems, -customized to particular workloads. See `use cases `_ for examples. - "Out of the box," DataFusion offers `SQL `_ and `Dataframe `_ APIs, excellent `performance `_, built-in support for CSV, Parquet, JSON, and Avro, diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index 1c25874c0806..4f774fe6d0f0 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -27,183 +27,9 @@ the rest of the documentation. [automatically created from the codebase]: https://github.com/apache/datafusion/issues/12740 -## General - -- [avg](#avg) -- [bool_and](#bool_and) -- [bool_or](#bool_or) -- [count](#count) -- [max](#max) -- [mean](#mean) -- [median](#median) -- [min](#min) -- [sum](#sum) -- [array_agg](#array_agg) -- [first_value](#first_value) -- [last_value](#last_value) - -### `avg` - -Returns the average of numeric values in the specified column. - -``` -avg(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -#### Aliases - -- `mean` - -### `bool_and` - -Returns true if all non-null input values are true, otherwise false. - -``` -bool_and(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `bool_or` - -Returns true if any non-null input value is true, otherwise false. - -``` -bool_or(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `count` - -Returns the number of non-null values in the specified column. - -To include _null_ values in the total count, use `count(*)`. - -``` -count(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `max` - -Returns the maximum value in the specified column. - -``` -max(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `mean` - -_Alias of [avg](#avg)._ - -### `median` - -Returns the median value in the specified column. - -``` -median(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `min` - -Returns the minimum value in the specified column. - -``` -min(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `sum` - -Returns the sum of all values in the specified column. - -``` -sum(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `array_agg` - -Returns an array created from the expression elements. If ordering requirement is given, elements are inserted in the order of required ordering. - -``` -array_agg(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `first_value` - -Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. - -``` -first_value(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `last_value` - -Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. - -``` -last_value(expression [ORDER BY expression]) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ## Statistical -- [corr](#corr) - [covar](#covar) -- [covar_pop](#covar_pop) -- [covar_samp](#covar_samp) -- [stddev](#stddev) -- [stddev_pop](#stddev_pop) -- [stddev_samp](#stddev_samp) - [regr_avgx](#regr_avgx) - [regr_avgy](#regr_avgy) - [regr_count](#regr_count) @@ -214,21 +40,6 @@ last_value(expression [ORDER BY expression]) - [regr_syy](#regr_syy) - [regr_sxy](#regr_sxy) -### `corr` - -Returns the coefficient of correlation between two numeric values. - -``` -corr(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ### `covar` Returns the covariance of a set of number pairs. @@ -244,75 +55,6 @@ covar(expression1, expression2) - **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. -### `covar_pop` - -Returns the population covariance of a set of number pairs. - -``` -covar_pop(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `covar_samp` - -Returns the sample covariance of a set of number pairs. - -``` -covar_samp(expression1, expression2) -``` - -#### Arguments - -- **expression1**: First expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Second expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev` - -Returns the standard deviation of a set of numbers. - -``` -stddev(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev_pop` - -Returns the population standard deviation of a set of numbers. - -``` -stddev_pop(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `stddev_samp` - -Returns the sample standard deviation of a set of numbers. - -``` -stddev_samp(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - ### `regr_slope` Returns the slope of the linear regression line for non-null pairs in aggregate columns. @@ -448,74 +190,3 @@ regr_sxy(expression_y, expression_x) Can be a constant, column, or function, and any combination of arithmetic operators. - **expression_x**: Independent variable. Can be a constant, column, or function, and any combination of arithmetic operators. - -## Approximate - -- [approx_distinct](#approx_distinct) -- [approx_median](#approx_median) -- [approx_percentile_cont](#approx_percentile_cont) -- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight) - -### `approx_distinct` - -Returns the approximate number of distinct input values calculated using the -HyperLogLog algorithm. - -``` -approx_distinct(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `approx_median` - -Returns the approximate median (50th percentile) of input values. -It is an alias of `approx_percentile_cont(x, 0.5)`. - -``` -approx_median(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `approx_percentile_cont` - -Returns the approximate percentile of input values using the t-digest algorithm. - -``` -approx_percentile_cont(expression, percentile, centroids) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). -- **centroids**: Number of centroids to use in the t-digest algorithm. _Default is 100_. - - If there are this number or fewer unique values, you can expect an exact result. - A higher number of centroids results in a more accurate approximation, but - requires more memory to compute. - -### `approx_percentile_cont_with_weight` - -Returns the weighted approximate percentile of input values using the -t-digest algorithm. - -``` -approx_percentile_cont_with_weight(expression, weight, percentile) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **weight**: Expression to use as weight. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md index 08cdb0a9867c..fc918c3b15ea 100644 --- a/docs/source/user-guide/sql/aggregate_functions_new.md +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -37,15 +37,80 @@ Aggregate functions operate on a set of values to compute a single result. ## General Functions +- [array_agg](#array_agg) +- [avg](#avg) - [bit_and](#bit_and) - [bit_or](#bit_or) - [bit_xor](#bit_xor) +- [bool_and](#bool_and) +- [bool_or](#bool_or) +- [count](#count) +- [first_value](#first_value) +- [grouping](#grouping) +- [last_value](#last_value) +- [max](#max) +- [mean](#mean) +- [median](#median) +- [min](#min) +- [string_agg](#string_agg) +- [sum](#sum) - [var](#var) - [var_pop](#var_pop) - [var_population](#var_population) - [var_samp](#var_samp) - [var_sample](#var_sample) +### `array_agg` + +Returns an array created from the expression elements. If ordering is required, elements are inserted in the specified order. + +``` +array_agg(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| array_agg(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| [element1, element2, element3] | ++-----------------------------------------------+ +``` + +### `avg` + +Returns the average of numeric values in the specified column. + +``` +avg(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **Aliases: **: `mean` + +#### Example + +```sql +> SELECT avg(column_name) FROM table_name; ++---------------------------+ +| avg(column_name) | ++---------------------------+ +| 42.75 | ++---------------------------+ +``` + +#### Aliases + +- mean + ### `bit_and` Computes the bitwise AND of all non-null input values. @@ -82,6 +147,276 @@ bit_xor(expression) - **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. +### `bool_and` + +Returns true if all non-null input values are true, otherwise false. + +``` +bool_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +``` + +### `bool_or` + +Returns true if all non-null input values are true, otherwise false. + +``` +bool_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +``` + +### `count` + +Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`. + +``` +count(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT count(column_name) FROM table_name; ++-----------------------+ +| count(column_name) | ++-----------------------+ +| 100 | ++-----------------------+ + +> SELECT count(*) FROM table_name; ++------------------+ +| count(*) | ++------------------+ +| 120 | ++------------------+ +``` + +### `first_value` + +Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + +``` +first_value(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +``` + +### `grouping` + +Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set. + +``` +grouping(expression) +``` + +#### Arguments + +- **expression**: Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function. + +#### Example + +```sql +> SELECT column_name, GROUPING(column_name) AS group_column + FROM table_name + GROUP BY GROUPING SETS ((column_name), ()); ++-------------+-------------+ +| column_name | group_column | ++-------------+-------------+ +| value1 | 0 | +| value2 | 0 | +| NULL | 1 | ++-------------+-------------+ +``` + +### `last_value` + +Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. + +``` +first_value(expression [ORDER BY expression]) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +``` + +### `max` + +Returns the maximum value in the specified column. + +``` +max(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +``` + +### `mean` + +_Alias of [avg](#avg)._ + +### `median` + +Returns the median value in the specified column. + +``` +median(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT median(column_name) FROM table_name; ++----------------------+ +| median(column_name) | ++----------------------+ +| 45.5 | ++----------------------+ +``` + +### `min` + +Returns the maximum value in the specified column. + +``` +max(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +``` + +### `string_agg` + +Concatenates the values of string expressions and places separator values between them. + +``` +string_agg(expression, delimiter) +``` + +#### Arguments + +- **expression**: The string expression to concatenate. Can be a column or any valid string expression. +- **delimiter**: A literal string used as a separator between the concatenated values. + +#### Example + +```sql +> SELECT string_agg(name, ', ') AS names_list + FROM employee; ++--------------------------+ +| names_list | ++--------------------------+ +| Alice, Bob, Charlie | ++--------------------------+ +``` + +### `sum` + +Returns the sum of all values in the specified column. + +``` +sum(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT sum(column_name) FROM table_name; ++-----------------------+ +| sum(column_name) | ++-----------------------+ +| 12345 | ++-----------------------+ +``` + ### `var` Returns the statistical sample variance of a set of numbers. @@ -126,3 +461,280 @@ _Alias of [var](#var)._ ### `var_sample` _Alias of [var](#var)._ + +## Statistical Functions + +- [corr](#corr) +- [covar](#covar) +- [covar_pop](#covar_pop) +- [covar_samp](#covar_samp) +- [nth_value](#nth_value) +- [stddev](#stddev) +- [stddev_pop](#stddev_pop) +- [stddev_samp](#stddev_samp) + +### `corr` + +Returns the coefficient of correlation between two numeric values. + +``` +corr(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT corr(column1, column2) FROM table_name; ++--------------------------------+ +| corr(column1, column2) | ++--------------------------------+ +| 0.85 | ++--------------------------------+ +``` + +### `covar` + +_Alias of [covar_samp](#covar_samp)._ + +### `covar_pop` + +Returns the sample covariance of a set of number pairs. + +``` +covar_samp(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +``` + +### `covar_samp` + +Returns the sample covariance of a set of number pairs. + +``` +covar_samp(expression1, expression2) +``` + +#### Arguments + +- **expression1**: First expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression2**: Second expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +``` + +#### Aliases + +- covar + +### `nth_value` + +Returns the nth value in a group of values. + +``` +nth_value(expression, n ORDER BY expression) +``` + +#### Arguments + +- **expression**: The column or expression to retrieve the nth value from. expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: The position (nth) of the value to retrieve, based on the ordering. + +#### Example + +```sql +> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept + FROM employee; ++---------+--------+-------------------------+ +| dept_id | salary | second_salary_by_dept | ++---------+--------+-------------------------+ +| 1 | 30000 | NULL | +| 1 | 40000 | 40000 | +| 1 | 50000 | 40000 | +| 2 | 35000 | NULL | +| 2 | 45000 | 45000 | ++---------+--------+-------------------------+ +``` + +### `stddev` + +Returns the standard deviation of a set of numbers. + +``` +stddev(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +``` + +#### Aliases + +- stddev_samp + +### `stddev_pop` + +Returns the standard deviation of a set of numbers. + +``` +stddev(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +``` + +### `stddev_samp` + +_Alias of [stddev](#stddev)._ + +## Approximate Functions + +- [approx_distinct](#approx_distinct) +- [approx_median](#approx_median) +- [approx_percentile_cont](#approx_percentile_cont) +- [approx_percentile_cont_with_weight](#approx_percentile_cont_with_weight) + +### `approx_distinct` + +Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm. + +``` +approx_distinct(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT approx_distinct(column_name) FROM table_name; ++-----------------------------------+ +| approx_distinct(column_name) | ++-----------------------------------+ +| 42 | ++-----------------------------------+ +``` + +### `approx_median` + +Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`. + +``` +approx_median(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +#### Example + +```sql +> SELECT approx_median(column_name) FROM table_name; ++-----------------------------------+ +| approx_median(column_name) | ++-----------------------------------+ +| 23.5 | ++-----------------------------------+ +``` + +### `approx_percentile_cont` + +Returns the approximate percentile of input values using the t-digest algorithm. + +``` +approx_percentile_cont(expression, percentile, centroids) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). +- **centroids**: Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory. + +#### Example + +```sql +> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; ++-------------------------------------------------+ +| approx_percentile_cont(column_name, 0.75, 100) | ++-------------------------------------------------+ +| 65.0 | ++-------------------------------------------------+ +``` + +### `approx_percentile_cont_with_weight` + +Returns the weighted approximate percentile of input values using the t-digest algorithm. + +``` +approx_percentile_cont_with_weight(expression, weight, percentile) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **weight**: Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators. +- **percentile**: Percentile to compute. Must be a float value between 0 and 1 (inclusive). + +#### Example + +```sql +> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; ++----------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | ++----------------------------------------------------------------------+ +| 78.5 | ++----------------------------------------------------------------------+ +``` diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 480767389086..95762d958521 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -557,62 +557,7 @@ trunc(numeric_expression[, decimal_places]) ## Conditional Functions -- [nullif](#nullif) -- [nvl](#nvl) -- [nvl2](#nvl2) -- [ifnull](#ifnull) - -### `nullif` - -Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. -This can be used to perform the inverse operation of [`coalesce`](#coalesce). - -``` -nullif(expression1, expression2) -``` - -#### Arguments - -- **expression1**: Expression to compare and return if equal to expression2. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: Expression to compare to expression1. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `nvl` - -Returns _expression2_ if _expression1_ is NULL; otherwise it returns _expression1_. - -``` -nvl(expression1, expression2) -``` - -#### Arguments - -- **expression1**: return if expression1 not is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: return if expression1 is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `nvl2` - -Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_. - -``` -nvl2(expression1, expression2, expression3) -``` - -#### Arguments - -- **expression1**: conditional expression. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression2**: return if expression1 is not NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. -- **expression3**: return if expression1 is NULL. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `ifnull` - -_Alias of [nvl](#nvl)._ +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ## String Functions @@ -2806,93 +2751,9 @@ are not allowed ## Struct Functions -- [struct](#struct) -- [named_struct](#named_struct) - [unnest](#unnest-struct) -### `struct` - -Returns an Arrow struct using the specified input expressions optionally named. -Fields in the returned struct use the optional name or the `cN` naming convention. -For example: `c0`, `c1`, `c2`, etc. - -``` -struct(expression1[, ..., expression_n]) -``` - -For example, this query converts two columns `a` and `b` to a single column with -a struct type of fields `field_a` and `c1`: - -``` -select * from t; -+---+---+ -| a | b | -+---+---+ -| 1 | 2 | -| 3 | 4 | -+---+---+ - --- use default names `c0`, `c1` -> select struct(a, b) from t; -+-----------------+ -| struct(t.a,t.b) | -+-----------------+ -| {c0: 1, c1: 2} | -| {c0: 3, c1: 4} | -+-----------------+ - --- name the first field `field_a` -select struct(a as field_a, b) from t; -+--------------------------------------------------+ -| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) | -+--------------------------------------------------+ -| {field_a: 1, c1: 2} | -| {field_a: 3, c1: 4} | -+--------------------------------------------------+ -``` - -#### Arguments - -- **expression_n**: Expression to include in the output struct. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed . - -### `named_struct` - -Returns an Arrow struct using the specified name and input expressions pairs. - -``` -named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input]) -``` - -For example, this query converts two columns `a` and `b` to a single column with -a struct type of fields `field_a` and `field_b`: - -``` -select * from t; -+---+---+ -| a | b | -+---+---+ -| 1 | 2 | -| 3 | 4 | -+---+---+ - -select named_struct('field_a', a, 'field_b', b) from t; -+-------------------------------------------------------+ -| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | -+-------------------------------------------------------+ -| {field_a: 1, field_b: 2} | -| {field_a: 3, field_b: 4} | -+-------------------------------------------------------+ -``` - -#### Arguments - -- **expression_n_name**: Name of the column field. - Must be a constant string. -- **expression_n_input**: Expression to include in the output struct. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. +For more struct functions see the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ### `unnest (struct)` @@ -3068,83 +2929,4 @@ select map_values(map([100, 5], [42,43])); ## Other Functions -- [arrow_cast](#arrow_cast) -- [arrow_typeof](#arrow_typeof) -- [version](#version) - -### `arrow_cast` - -Casts a value to a specific Arrow data type: - -``` -arrow_cast(expression, datatype) -``` - -#### Arguments - -- **expression**: Expression to cast. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. -- **datatype**: [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name - to cast to, as a string. The format is the same as that returned by [`arrow_typeof`] - -#### Example - -``` -> select arrow_cast(-5, 'Int8') as a, - arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, - arrow_cast('bar', 'LargeUtf8') as c, - arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d - ; -+----+-----+-----+---------------------------+ -| a | b | c | d | -+----+-----+-----+---------------------------+ -| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | -+----+-----+-----+---------------------------+ -1 row in set. Query took 0.001 seconds. -``` - -### `arrow_typeof` - -Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression: - -``` -arrow_typeof(expression) -``` - -#### Arguments - -- **expression**: Expression to evaluate. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. - -#### Example - -``` -> select arrow_typeof('foo'), arrow_typeof(1); -+---------------------------+------------------------+ -| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | -+---------------------------+------------------------+ -| Utf8 | Int64 | -+---------------------------+------------------------+ -1 row in set. Query took 0.001 seconds. -``` - -### `version` - -Returns the version of DataFusion. - -``` -version() -``` - -#### Example - -``` -> select version(); -+--------------------------------------------+ -| version() | -+--------------------------------------------+ -| Apache DataFusion 41.0.0, aarch64 on macos | -+--------------------------------------------+ -``` +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index 673c55f46b15..96fbcaa1104b 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -54,6 +54,10 @@ log(numeric_expression) ## Conditional Functions - [coalesce](#coalesce) +- [ifnull](#ifnull) +- [nullif](#nullif) +- [nvl](#nvl) +- [nvl2](#nvl2) ### `coalesce` @@ -67,6 +71,117 @@ coalesce(expression1[, ..., expression_n]) - **expression1, expression_n**: Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary. +#### Example + +```sql +> select coalesce(null, null, 'datafusion'); ++----------------------------------------+ +| coalesce(NULL,NULL,Utf8("datafusion")) | ++----------------------------------------+ +| datafusion | ++----------------------------------------+ +``` + +### `ifnull` + +_Alias of [nvl](#nvl)._ + +### `nullif` + +Returns _null_ if _expression1_ equals _expression2_; otherwise it returns _expression1_. +This can be used to perform the inverse operation of [`coalesce`](#coalesce). + +``` +nullif(expression1, expression2) +``` + +#### Arguments + +- **expression1**: Expression to compare and return if equal to expression2. Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to compare to expression1. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nullif('datafusion', 'data'); ++-----------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("data")) | ++-----------------------------------------+ +| datafusion | ++-----------------------------------------+ +> select nullif('datafusion', 'datafusion'); ++-----------------------------------------------+ +| nullif(Utf8("datafusion"),Utf8("datafusion")) | ++-----------------------------------------------+ +| | ++-----------------------------------------------+ +``` + +### `nvl` + +Returns _expression2_ if _expression1_ is NULL otherwise it returns _expression1_. + +``` +nvl(expression1, expression2) +``` + +#### Arguments + +- **expression1**: Expression to return if not null. Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nvl(null, 'a'); ++---------------------+ +| nvl(NULL,Utf8("a")) | ++---------------------+ +| a | ++---------------------+\ +> select nvl('b', 'a'); ++--------------------------+ +| nvl(Utf8("b"),Utf8("a")) | ++--------------------------+ +| b | ++--------------------------+ +``` + +#### Aliases + +- ifnull + +### `nvl2` + +Returns _expression2_ if _expression1_ is not NULL; otherwise it returns _expression3_. + +``` +nvl2(expression1, expression2, expression3) +``` + +#### Arguments + +- **expression1**: Expression to test for null. Can be a constant, column, or function, and any combination of operators. +- **expression2**: Expression to return if expr1 is not null. Can be a constant, column, or function, and any combination of operators. +- **expression3**: Expression to return if expr1 is null. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select nvl2(null, 'a', 'b'); ++--------------------------------+ +| nvl2(NULL,Utf8("a"),Utf8("b")) | ++--------------------------------+ +| b | ++--------------------------------+ +> select nvl2('data', 'a', 'b'); ++----------------------------------------+ +| nvl2(Utf8("data"),Utf8("a"),Utf8("b")) | ++----------------------------------------+ +| a | ++----------------------------------------+ +``` + ## String Functions - [ascii](#ascii) @@ -1159,6 +1274,102 @@ to_date('2017-05-31', '%Y-%m-%d') Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +## Struct Functions + +- [named_struct](#named_struct) +- [row](#row) +- [struct](#struct) + +### `named_struct` + +Returns an Arrow struct using the specified name and input expressions pairs. + +``` +named_struct(expression1_name, expression1_input[, ..., expression_n_name, expression_n_input]) +``` + +#### Arguments + +- **expression_n_name**: Name of the column field. Must be a constant string. +- **expression_n_input**: Expression to include in the output struct. Can be a constant, column, or function, and any combination of arithmetic or string operators. + +#### Example + +For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `field_b`: + +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ +> select named_struct('field_a', a, 'field_b', b) from t; ++-------------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) | ++-------------------------------------------------------+ +| {field_a: 1, field_b: 2} | +| {field_a: 3, field_b: 4} | ++-------------------------------------------------------+ +``` + +### `row` + +_Alias of [struct](#struct)._ + +### `struct` + +Returns an Arrow struct using the specified input expressions optionally named. +Fields in the returned struct use the optional name or the `cN` naming convention. +For example: `c0`, `c1`, `c2`, etc. + +``` +struct(expression1[, ..., expression_n]) +``` + +#### Arguments + +- **expression1, expression_n**: Expression to include in the output struct. Can be a constant, column, or function, any combination of arithmetic or string operators. + +#### Example + +For example, this query converts two columns `a` and `b` to a single column with +a struct type of fields `field_a` and `c1`: + +```sql +> select * from t; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | +| 3 | 4 | ++---+---+ + +-- use default names `c0`, `c1` +> select struct(a, b) from t; ++-----------------+ +| struct(t.a,t.b) | ++-----------------+ +| {c0: 1, c1: 2} | +| {c0: 3, c1: 4} | ++-----------------+ + +-- name the first field `field_a` +select struct(a as field_a, b) from t; ++--------------------------------------------------+ +| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) | ++--------------------------------------------------+ +| {field_a: 1, c1: 2} | +| {field_a: 3, c1: 4} | ++--------------------------------------------------+ +``` + +#### Aliases + +- row + ## Hashing Functions - [digest](#digest) @@ -1314,3 +1525,123 @@ sha512(expression) | | +-------------------------------------------+ ``` + +## Other Functions + +- [arrow_cast](#arrow_cast) +- [arrow_typeof](#arrow_typeof) +- [get_field](#get_field) +- [version](#version) + +### `arrow_cast` + +Casts a value to a specific Arrow data type. + +``` +arrow_cast(expression, datatype) +``` + +#### Arguments + +- **expression**: Expression to cast. The expression can be a constant, column, or function, and any combination of operators. +- **datatype**: [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to cast to, as a string. The format is the same as that returned by [`arrow_typeof`] + +#### Example + +```sql +> select arrow_cast(-5, 'Int8') as a, + arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b, + arrow_cast('bar', 'LargeUtf8') as c, + arrow_cast('2023-01-02T12:53:02', 'Timestamp(Microsecond, Some("+08:00"))') as d + ; ++----+-----+-----+---------------------------+ +| a | b | c | d | ++----+-----+-----+---------------------------+ +| -5 | foo | bar | 2023-01-02T12:53:02+08:00 | ++----+-----+-----+---------------------------+ +``` + +### `arrow_typeof` + +Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression. + +``` +arrow_typeof(expression) +``` + +#### Arguments + +- **expression**: Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select arrow_typeof('foo'), arrow_typeof(1); ++---------------------------+------------------------+ +| arrow_typeof(Utf8("foo")) | arrow_typeof(Int64(1)) | ++---------------------------+------------------------+ +| Utf8 | Int64 | ++---------------------------+------------------------+ +``` + +### `get_field` + +Returns a field within a map or a struct with the given key. +Note: most users invoke `get_field` indirectly via field access +syntax such as `my_struct_col['field_name']` which results in a call to +`get_field(my_struct_col, 'field_name')`. + +``` +get_field(expression1, expression2) +``` + +#### Arguments + +- **expression1**: The map or struct to retrieve a field for. +- **expression2**: The field name in the map or struct to retrieve data for. Must evaluate to a string. + +#### Example + +```sql +> create table t (idx varchar, v varchar) as values ('data','fusion'), ('apache', 'arrow'); +> select struct(idx, v) from t as c; ++-------------------------+ +| struct(c.idx,c.v) | ++-------------------------+ +| {c0: data, c1: fusion} | +| {c0: apache, c1: arrow} | ++-------------------------+ +> select get_field((select struct(idx, v) from t), 'c0'); ++-----------------------+ +| struct(t.idx,t.v)[c0] | ++-----------------------+ +| data | +| apache | ++-----------------------+ +> select get_field((select struct(idx, v) from t), 'c1'); ++-----------------------+ +| struct(t.idx,t.v)[c1] | ++-----------------------+ +| fusion | +| arrow | ++-----------------------+ +``` + +### `version` + +Returns the version of DataFusion. + +``` +version() +``` + +#### Example + +```sql +> select version(); ++--------------------------------------------+ +| version() | ++--------------------------------------------+ +| Apache DataFusion 42.0.0, aarch64 on macos | ++--------------------------------------------+ +```