From 2026df418d4977bedb8592f28d08568e166c67b3 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 14 Mar 2024 09:42:01 -0400 Subject: [PATCH 1/6] Fix to_timestamp benchmark --- datafusion/functions/benches/to_timestamp.rs | 173 ++++++++++--------- 1 file changed, 92 insertions(+), 81 deletions(-) diff --git a/datafusion/functions/benches/to_timestamp.rs b/datafusion/functions/benches/to_timestamp.rs index c83824526442..31d609dee9bc 100644 --- a/datafusion/functions/benches/to_timestamp.rs +++ b/datafusion/functions/benches/to_timestamp.rs @@ -17,97 +17,108 @@ extern crate criterion; +use std::sync::Arc; + +use arrow_array::builder::StringBuilder; +use arrow_array::ArrayRef; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use datafusion_expr::lit; -use datafusion_functions::expr_fn::to_timestamp; +use datafusion_expr::ColumnarValue; +use datafusion_functions::datetime::to_timestamp; fn criterion_benchmark(c: &mut Criterion) { c.bench_function("to_timestamp_no_formats", |b| { - let inputs = vec![ - lit("1997-01-31T09:26:56.123Z"), - lit("1997-01-31T09:26:56.123-05:00"), - lit("1997-01-31 09:26:56.123-05:00"), - lit("2023-01-01 04:05:06.789 -08"), - lit("1997-01-31T09:26:56.123"), - lit("1997-01-31 09:26:56.123"), - lit("1997-01-31 09:26:56"), - lit("1997-01-31 13:26:56"), - lit("1997-01-31 13:26:56+04:00"), - lit("1997-01-31"), - ]; + let mut inputs = StringBuilder::new(); + inputs.append_value("1997-01-31T09:26:56.123Z"); + inputs.append_value("1997-01-31T09:26:56.123-05:00"); + inputs.append_value("1997-01-31 09:26:56.123-05:00"); + inputs.append_value("2023-01-01 04:05:06.789 -08"); + inputs.append_value("1997-01-31T09:26:56.123"); + inputs.append_value("1997-01-31 09:26:56.123"); + inputs.append_value("1997-01-31 09:26:56"); + inputs.append_value("1997-01-31 13:26:56"); + inputs.append_value("1997-01-31 13:26:56+04:00"); + inputs.append_value("1997-01-31"); + + let string_array = ColumnarValue::Array(Arc::new(inputs.finish()) as 
ArrayRef); + b.iter(|| { - for i in inputs.iter() { - black_box(to_timestamp(vec![i.clone()])); - } - }); + black_box( + to_timestamp() + .invoke(&[string_array.clone()]) + .expect("to_timestamp should work on valid values"), + ) + }) }); c.bench_function("to_timestamp_with_formats", |b| { - let mut inputs = vec![]; - let mut format1 = vec![]; - let mut format2 = vec![]; - let mut format3 = vec![]; - - inputs.push(lit("1997-01-31T09:26:56.123Z")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%Z")); - - inputs.push(lit("1997-01-31T09:26:56.123-05:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f%z")); - - inputs.push(lit("1997-01-31 09:26:56.123-05:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f%Z")); - - inputs.push(lit("2023-01-01 04:05:06.789 -08")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f %#z")); - - inputs.push(lit("1997-01-31T09:26:56.123")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%dT%H:%M:%S%.f")); - - inputs.push(lit("1997-01-31 09:26:56.123")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S%.f")); - - inputs.push(lit("1997-01-31 09:26:56")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H:%M:%S")); - - inputs.push(lit("1997-01-31 092656")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H%M%S")); - - inputs.push(lit("1997-01-31 092656+04:00")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d %H%M%S%:z")); - - inputs.push(lit("Sun Jul 8 00:34:60 2001")); - format1.push(lit("%+")); - format2.push(lit("%c")); - format3.push(lit("%Y-%m-%d 00:00:00")); - + let mut inputs = StringBuilder::new(); + let mut format1_builder = StringBuilder::with_capacity(2, 10); + let 
mut format2_builder = StringBuilder::with_capacity(2, 10); + let mut format3_builder = StringBuilder::with_capacity(2, 10); + + inputs.append_value("1997-01-31T09:26:56.123Z"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z"); + + inputs.append_value("1997-01-31T09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z"); + + inputs.append_value("1997-01-31 09:26:56.123-05:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z"); + + inputs.append_value("2023-01-01 04:05:06.789 -08"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z"); + + inputs.append_value("1997-01-31T09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56.123"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f"); + + inputs.append_value("1997-01-31 09:26:56"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H:%M:%S"); + + inputs.append_value("1997-01-31 092656"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S"); + + inputs.append_value("1997-01-31 092656+04:00"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d %H%M%S%:z"); + + inputs.append_value("Sun Jul 8 00:34:60 2001"); + format1_builder.append_value("%+"); + format2_builder.append_value("%c"); + format3_builder.append_value("%Y-%m-%d 00:00:00"); + + let args = [ + 
ColumnarValue::Array(Arc::new(inputs.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format1_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format2_builder.finish()) as ArrayRef), + ColumnarValue::Array(Arc::new(format3_builder.finish()) as ArrayRef), + ]; b.iter(|| { - inputs.iter().enumerate().for_each(|(idx, i)| { - black_box(to_timestamp(vec![ - i.clone(), - format1.get(idx).unwrap().clone(), - format2.get(idx).unwrap().clone(), - format3.get(idx).unwrap().clone(), - ])); - }) + black_box( + to_timestamp() + .invoke(&args.clone()) + .expect("to_timestamp should work on valid values"), + ) }) }); } From 6a450b4fa2caa523cb42580c912f742dd1a1ed2b Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Mon, 18 Mar 2024 10:43:02 -0400 Subject: [PATCH 2/6] Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. --- docs/source/user-guide/example-usage.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/source/user-guide/example-usage.md b/docs/source/user-guide/example-usage.md index 1c5c8f49a16a..c5eefbdaf156 100644 --- a/docs/source/user-guide/example-usage.md +++ b/docs/source/user-guide/example-usage.md @@ -240,17 +240,11 @@ async fn main() -> datafusion::error::Result<()> { } ``` -Finally, in order to build with the `simd` optimization `cargo nightly` is required. - -```shell -rustup toolchain install nightly -``` - Based on the instruction set architecture you are building on you will want to configure the `target-cpu` as well, ideally with `native` or at least `avx2`. 
```shell -RUSTFLAGS='-C target-cpu=native' cargo +nightly run --release +RUSTFLAGS='-C target-cpu=native' cargo run --release ``` ## Enable backtraces From a94a4f6c3317e8e952d34d968996fbd603cd0c2e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 22 Mar 2024 22:20:35 -0400 Subject: [PATCH 3/6] Fixed missing trim() function. --- datafusion/functions/src/string/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datafusion/functions/src/string/mod.rs b/datafusion/functions/src/string/mod.rs index 63026092f39a..517869a25682 100644 --- a/datafusion/functions/src/string/mod.rs +++ b/datafusion/functions/src/string/mod.rs @@ -72,6 +72,11 @@ pub mod expr_fn { super::to_hex().call(vec![arg1]) } + #[doc = "Removes all characters, spaces by default, from both sides of a string"] + pub fn trim(args: Vec<Expr>) -> Expr { + super::btrim().call(args) + } + #[doc = "Converts a string to uppercase."] pub fn upper(arg1: Expr) -> Expr { super::upper().call(vec![arg1]) From 695abb60bc9c9ade998c671a1de18d1cb5aff646 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 30 Mar 2024 16:42:53 -0400 Subject: [PATCH 4/6] Add benchmark for substr_index --- datafusion/functions/Cargo.toml | 6 +- datafusion/functions/benches/substr_index.rs | 102 +++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 datafusion/functions/benches/substr_index.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 425ac207c33e..933893492c2a 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -106,4 +106,8 @@ required-features = ["datetime_expressions"] [[bench]] harness = false name = "to_char" -required-features = ["datetime_expressions"] + +[[bench]] +harness = false +name = "substr_index" +required-features = ["unicode_expressions"] diff --git a/datafusion/functions/benches/substr_index.rs b/datafusion/functions/benches/substr_index.rs new file mode 100644 index 000000000000..71e66b941084 --- 
/dev/null +++ b/datafusion/functions/benches/substr_index.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array, StringArray}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::distributions::{Alphanumeric, Uniform}; +use rand::prelude::Distribution; +use rand::Rng; + +use datafusion_expr::ColumnarValue; +use datafusion_functions::unicode::substr_index; + +struct Filter<Dist, Test> { + dist: Dist, + test: Test, +} + +impl<T, Dist, Test> Distribution<T> for Filter<Dist, Test> +where + Dist: Distribution<T>, + Test: Fn(&T) -> bool, +{ + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> T { + loop { + let x = self.dist.sample(rng); + if (self.test)(&x) { + return x; + } + } + } +} + +fn data() -> (StringArray, StringArray, Int64Array) { + let dist = Filter { + dist: Uniform::new(-4, 5), + test: |x: &i64| x != &0, + }; + let mut rng = rand::thread_rng(); + let mut strings: Vec<String> = vec![]; + let mut delimiters: Vec<String> = vec![]; + let mut counts: Vec<i64> = vec![]; + + for _ in 0..1000 { + let length = rng.gen_range(20..50); + let text: String = (&mut rng) + .sample_iter(&Alphanumeric) + .take(length) + .map(char::from) + .collect(); + let char = 
rng.gen_range(0..text.len()); + let delimiter = &text.chars().nth(char).unwrap(); + let count = rng.sample(&dist); + + strings.push(text); + delimiters.push(delimiter.to_string()); + counts.push(count); + } + + ( + StringArray::from(strings), + StringArray::from(delimiters), + Int64Array::from(counts), + ) +} + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("substr_index_array_array_1000", |b| { + let (strings, delimiters, counts) = data(); + let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef); + let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef); + let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef); + + b.iter(|| { + black_box( + substr_index() + .invoke(&[strings.clone(), delimiters.clone(), counts.clone()]) + .expect("substr_index should work on valid values"), + ) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From cb12420f2ba6e316f2a6306135a34dc416963830 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 30 Mar 2024 17:55:14 -0400 Subject: [PATCH 5/6] Add missing required-features --- datafusion/functions/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 933893492c2a..ef7d2c9b1892 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -106,6 +106,7 @@ required-features = ["datetime_expressions"] [[bench]] harness = false name = "to_char" +required-features = ["datetime_expressions"] [[bench]] harness = false From 0e033dbfbd964d74d4036df4ca1a545dabb79004 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 31 Mar 2024 10:20:01 -0400 Subject: [PATCH 6/6] Update datafusion/functions/benches/substr_index.rs Co-authored-by: Andrew Lamb --- datafusion/functions/benches/substr_index.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/benches/substr_index.rs 
b/datafusion/functions/benches/substr_index.rs index 71e66b941084..bb9a5b809eee 100644 --- a/datafusion/functions/benches/substr_index.rs +++ b/datafusion/functions/benches/substr_index.rs @@ -88,10 +88,11 @@ fn criterion_benchmark(c: &mut Criterion) { let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef); let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef); + let args = [strings, delimiters, counts]; b.iter(|| { black_box( substr_index() - .invoke(&[strings.clone(), delimiters.clone(), counts.clone()]) + .invoke(&args) .expect("substr_index should work on valid values"), ) })