Skip to content

Commit

Permalink
move the Translate, SubstrIndex, FindInSet functions to datafusion-fu…
Browse files Browse the repository at this point in the history
…nctions (apache#9864)

* Fix to_timestamp benchmark

* Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started.

* Fixed missing trim() function.

* Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function

* move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions

* move strpos, substr functions to datafusion_functions

* move the Translate, SubstrIndex, FindInSet functions to new datafusion-functions crate

* Test code cleanup

* unicode_expressions Cargo.toml updates.

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
2 people authored and Lordworms committed Apr 1, 2024
1 parent 147cc7b commit ab88cac
Show file tree
Hide file tree
Showing 19 changed files with 510 additions and 422 deletions.
2 changes: 1 addition & 1 deletion datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ regex_expressions = [
serde = ["arrow-schema/serde"]
string_expressions = ["datafusion-functions/string_expressions"]
unicode_expressions = [
"datafusion-physical-expr/unicode_expressions",
"datafusion-optimizer/unicode_expressions",
"datafusion-sql/unicode_expressions",
"datafusion-functions/unicode_expressions",
]
Expand Down
40 changes: 0 additions & 40 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,6 @@ pub enum BuiltinScalarFunction {
InitCap,
/// random
Random,
/// translate
Translate,
/// substr_index
SubstrIndex,
/// find_in_set
FindInSet,
}

/// Maps the sql function name to `BuiltinScalarFunction`
Expand Down Expand Up @@ -198,9 +192,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
BuiltinScalarFunction::InitCap => Volatility::Immutable,
BuiltinScalarFunction::Radians => Volatility::Immutable,
BuiltinScalarFunction::Translate => Volatility::Immutable,
BuiltinScalarFunction::SubstrIndex => Volatility::Immutable,
BuiltinScalarFunction::FindInSet => Volatility::Immutable,

// Volatile builtin functions
BuiltinScalarFunction::Random => Volatility::Volatile,
Expand Down Expand Up @@ -237,15 +228,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Pi => Ok(Float64),
BuiltinScalarFunction::Random => Ok(Float64),
BuiltinScalarFunction::EndsWith => Ok(Boolean),
BuiltinScalarFunction::SubstrIndex => {
utf8_to_str_type(&input_expr_types[0], "substr_index")
}
BuiltinScalarFunction::FindInSet => {
utf8_to_int_type(&input_expr_types[0], "find_in_set")
}
BuiltinScalarFunction::Translate => {
utf8_to_str_type(&input_expr_types[0], "translate")
}

BuiltinScalarFunction::Factorial
| BuiltinScalarFunction::Gcd
Expand Down Expand Up @@ -326,22 +308,6 @@ impl BuiltinScalarFunction {
],
self.volatility(),
),

BuiltinScalarFunction::SubstrIndex => Signature::one_of(
vec![
Exact(vec![Utf8, Utf8, Int64]),
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
],
self.volatility(),
),
BuiltinScalarFunction::FindInSet => Signature::one_of(
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
self.volatility(),
),

BuiltinScalarFunction::Translate => {
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
}
BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()),
BuiltinScalarFunction::Power => Signature::one_of(
Expand Down Expand Up @@ -492,9 +458,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
BuiltinScalarFunction::EndsWith => &["ends_with"],
BuiltinScalarFunction::InitCap => &["initcap"],
BuiltinScalarFunction::Translate => &["translate"],
BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"],
BuiltinScalarFunction::FindInSet => &["find_in_set"],
}
}
}
Expand Down Expand Up @@ -559,9 +522,6 @@ macro_rules! get_optimal_return_type {
// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size.
get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);

// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size.
get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);

#[cfg(test)]
mod tests {
use super::*;
Expand Down
7 changes: 0 additions & 7 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");

scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase");
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`");
scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`");
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL");
//there is a func concat_ws before, so use concat_ws_expr as name.c
nary_scalar_expr!(
Expand All @@ -593,9 +592,6 @@ scalar_expr!(
"returns true if a given number is +0.0 or -0.0 otherwise returns false"
);

scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter");
scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings");

/// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression.
pub fn case(expr: Expr) -> CaseBuilder {
CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None)
Expand Down Expand Up @@ -1006,8 +1002,5 @@ mod test {
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
test_scalar_expr!(InitCap, initcap, string);
test_scalar_expr!(EndsWith, ends_with, string, characters);
test_scalar_expr!(Translate, translate, string, from, to);
test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count);
test_scalar_expr!(FindInSet, find_in_set, string, stringlist);
}
}
3 changes: 2 additions & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ regex_expressions = ["regex"]
# enable string functions
string_expressions = ["uuid"]
# enable unicode functions
unicode_expressions = ["unicode-segmentation"]
unicode_expressions = ["hashbrown", "unicode-segmentation"]

[lib]
name = "datafusion_functions"
Expand All @@ -72,6 +72,7 @@ datafusion-common = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-physical-expr = { workspace = true, default-features = true }
hashbrown = { version = "0.14", features = ["raw"], optional = true }
hex = { version = "0.4", optional = true }
itertools = { workspace = true }
log = { workspace = true }
Expand Down
119 changes: 119 additions & 0 deletions datafusion/functions/src/unicode/find_in_set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::sync::Arc;

use arrow::array::{
ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::{exec_err, Result};
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};

use crate::utils::{make_scalar_function, utf8_to_int_type};

#[derive(Debug)]
pub(super) struct FindInSetFunc {
signature: Signature,
}

impl FindInSetFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
Volatility::Immutable,
),
}
}
}

impl ScalarUDFImpl for FindInSetFunc {
fn as_any(&self) -> &dyn Any {
self
}

fn name(&self) -> &str {
"find_in_set"
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
utf8_to_int_type(&arg_types[0], "find_in_set")
}

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(find_in_set::<Int32Type>, vec![])(args)
}
DataType::LargeUtf8 => {
make_scalar_function(find_in_set::<Int64Type>, vec![])(args)
}
other => {
exec_err!("Unsupported data type {other:?} for function find_in_set")
}
}
}
}

///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings
///A string list is a string composed of substrings separated by , characters.
pub fn find_in_set<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
where
T::Native: OffsetSizeTrait,
{
if args.len() != 2 {
return exec_err!(
"find_in_set was called with {} arguments. It requires 2.",
args.len()
);
}

let str_array: &GenericStringArray<T::Native> =
as_generic_string_array::<T::Native>(&args[0])?;
let str_list_array: &GenericStringArray<T::Native> =
as_generic_string_array::<T::Native>(&args[1])?;

let result = str_array
.iter()
.zip(str_list_array.iter())
.map(|(string, str_list)| match (string, str_list) {
(Some(string), Some(str_list)) => {
let mut res = 0;
let str_set: Vec<&str> = str_list.split(',').collect();
for (idx, str) in str_set.iter().enumerate() {
if str == &string {
res = idx + 1;
break;
}
}
T::Native::from_usize(res)
}
_ => None,
})
.collect::<PrimitiveArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
24 changes: 24 additions & 0 deletions datafusion/functions/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,33 @@ use std::sync::Arc;
use datafusion_expr::ScalarUDF;

mod character_length;
mod find_in_set;
mod left;
mod lpad;
mod reverse;
mod right;
mod rpad;
mod strpos;
mod substr;
mod substrindex;
mod translate;

// create UDFs
make_udf_function!(
character_length::CharacterLengthFunc,
CHARACTER_LENGTH,
character_length
);
make_udf_function!(find_in_set::FindInSetFunc, FIND_IN_SET, find_in_set);
make_udf_function!(left::LeftFunc, LEFT, left);
make_udf_function!(lpad::LPadFunc, LPAD, lpad);
make_udf_function!(right::RightFunc, RIGHT, right);
make_udf_function!(reverse::ReverseFunc, REVERSE, reverse);
make_udf_function!(rpad::RPadFunc, RPAD, rpad);
make_udf_function!(strpos::StrposFunc, STRPOS, strpos);
make_udf_function!(substr::SubstrFunc, SUBSTR, substr);
make_udf_function!(substrindex::SubstrIndexFunc, SUBSTR_INDEX, substr_index);
make_udf_function!(translate::TranslateFunc, TRANSLATE, translate);

pub mod expr_fn {
use datafusion_expr::Expr;
Expand All @@ -57,6 +63,11 @@ pub mod expr_fn {
super::character_length().call(vec![string])
}

#[doc = "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"]
pub fn find_in_set(string: Expr, strlist: Expr) -> Expr {
super::find_in_set().call(vec![string, strlist])
}

#[doc = "finds the position from where the `substring` matches the `string`"]
pub fn instr(string: Expr, substring: Expr) -> Expr {
strpos(string, substring)
Expand Down Expand Up @@ -111,18 +122,31 @@ pub mod expr_fn {
pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr {
super::substr().call(vec![string, position, length])
}

#[doc = "Returns the substring from str before count occurrences of the delimiter"]
pub fn substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr {
super::substr_index().call(vec![string, delimiter, count])
}

#[doc = "replaces the characters in `from` with the counterpart in `to`"]
pub fn translate(string: Expr, from: Expr, to: Expr) -> Expr {
super::translate().call(vec![string, from, to])
}
}

/// Return a list of all functions in this package
pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![
character_length(),
find_in_set(),
left(),
lpad(),
reverse(),
right(),
rpad(),
strpos(),
substr(),
substr_index(),
translate(),
]
}
Loading

0 comments on commit ab88cac

Please sign in to comment.