From 1981d95a50c871cb3d10e89891e4e877d3270648 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 14 Nov 2024 02:42:30 +0000 Subject: [PATCH] Add support for Utf8View to crypto functions #13406 --- datafusion/functions/src/crypto/basic.rs | 57 ++++++++++++++---- datafusion/functions/src/crypto/digest.rs | 1 + datafusion/functions/src/crypto/md5.rs | 4 +- datafusion/functions/src/crypto/sha224.rs | 2 +- datafusion/functions/src/crypto/sha256.rs | 2 +- datafusion/functions/src/crypto/sha384.rs | 2 +- datafusion/functions/src/crypto/sha512.rs | 2 +- datafusion/sqllogictest/test_files/expr.slt | 5 ++ .../test_files/string/string_view.slt | 60 +++++++++++++++++++ 9 files changed, 116 insertions(+), 19 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index 716afd84a9c9..74dc5d517c2b 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -17,17 +17,18 @@ //! "crypto" DataFusion functions -use arrow::array::StringArray; use arrow::array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait}; +use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray}; use arrow::datatypes::DataType; use blake2::{Blake2b512, Blake2s256, Digest}; use blake3::Hasher as Blake3; use datafusion_common::cast::as_binary_array; +use arrow::compute::StringArrayType; use datafusion_common::plan_err; use datafusion_common::{ - cast::{as_generic_binary_array, as_generic_string_array}, - exec_err, internal_err, DataFusionError, Result, ScalarValue, + cast::as_generic_binary_array, exec_err, internal_err, DataFusionError, Result, + ScalarValue, }; use datafusion_expr::ColumnarValue; use md5::Md5; @@ -121,9 +122,9 @@ pub fn digest(args: &[ColumnarValue]) -> Result { } let digest_algorithm = match &args[1] { ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => { - method.parse::() - } + 
ScalarValue::Utf8View(Some(method)) + | ScalarValue::Utf8(Some(method)) + | ScalarValue::LargeUtf8(Some(method)) => method.parse::(), other => exec_err!("Unsupported data type {other:?} for function digest"), }, ColumnarValue::Array(_) => { @@ -132,6 +133,7 @@ pub fn digest(args: &[ColumnarValue]) -> Result { }?; digest_process(&args[0], digest_algorithm) } + impl FromStr for DigestAlgorithm { type Err = DataFusionError; fn from_str(name: &str) -> Result { @@ -166,12 +168,14 @@ impl FromStr for DigestAlgorithm { }) } } + impl fmt::Display for DigestAlgorithm { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", format!("{self:?}").to_lowercase()) } } -// /// computes md5 hash digest of the given input + +/// computes md5 hash digest of the given input pub fn md5(args: &[ColumnarValue]) -> Result { if args.len() != 1 { return exec_err!( @@ -180,7 +184,9 @@ pub fn md5(args: &[ColumnarValue]) -> Result { DigestAlgorithm::Md5 ); } + let value = digest_process(&args[0], DigestAlgorithm::Md5)?; + // md5 requires special handling because of its unique utf8 return type Ok(match value { ColumnarValue::Array(array) => { @@ -214,7 +220,8 @@ pub fn utf8_or_binary_to_binary_type( name: &str, ) -> Result { Ok(match arg_type { - DataType::LargeUtf8 + DataType::Utf8View + | DataType::LargeUtf8 | DataType::Utf8 | DataType::Binary | DataType::LargeBinary => DataType::Binary, @@ -296,8 +303,30 @@ impl DigestAlgorithm { where T: OffsetSizeTrait, { - let input_value = as_generic_string_array::(value)?; - let array: ArrayRef = match self { + let array = match value.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + let v = value.as_string::(); + self.digest_utf8_array_impl::<&GenericStringArray>(v) + } + DataType::Utf8View => { + let v = value.as_string_view(); + self.digest_utf8_array_impl::<&StringViewArray>(v) + } + other => { + return exec_err!("unsupported type for digest_utf_array: {other:?}") + } + }; + Ok(ColumnarValue::Array(array)) + } + + pub fn 
digest_utf8_array_impl<'a, StringArrType>( + self, + input_value: StringArrType, + ) -> ArrayRef + where + StringArrType: StringArrayType<'a>, + { + match self { Self::Md5 => digest_to_array!(Md5, input_value), Self::Sha224 => digest_to_array!(Sha224, input_value), Self::Sha256 => digest_to_array!(Sha256, input_value), @@ -318,8 +347,7 @@ impl DigestAlgorithm { .collect(); Arc::new(binary_array) } - }; - Ok(ColumnarValue::Array(array)) + } } } pub fn digest_process( @@ -328,6 +356,7 @@ pub fn digest_process( ) -> Result { match value { ColumnarValue::Array(a) => match a.data_type() { + DataType::Utf8View => digest_algorithm.digest_utf8_array::(a.as_ref()), DataType::Utf8 => digest_algorithm.digest_utf8_array::(a.as_ref()), DataType::LargeUtf8 => digest_algorithm.digest_utf8_array::(a.as_ref()), DataType::Binary => digest_algorithm.digest_binary_array::(a.as_ref()), @@ -339,7 +368,9 @@ pub fn digest_process( ), }, ColumnarValue::Scalar(scalar) => match scalar { - ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => { + ScalarValue::Utf8View(a) + | ScalarValue::Utf8(a) + | ScalarValue::LargeUtf8(a) => { Ok(digest_algorithm .digest_scalar(a.as_ref().map(|s: &String| s.as_bytes()))) } diff --git a/datafusion/functions/src/crypto/digest.rs b/datafusion/functions/src/crypto/digest.rs index 0e43fb7785df..f738c6e3e40f 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -42,6 +42,7 @@ impl DigestFunc { Self { signature: Signature::one_of( vec![ + Exact(vec![Utf8View, Utf8View]), Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, Utf8]), Exact(vec![Binary, Utf8]), diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 062d63bcc018..0f18fd47b4cf 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -42,7 +42,7 @@ impl Md5Func { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8, Binary, LargeBinary], + vec![Utf8View, Utf8, 
LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), } @@ -65,7 +65,7 @@ impl ScalarUDFImpl for Md5Func { use DataType::*; Ok(match &arg_types[0] { LargeUtf8 | LargeBinary => LargeUtf8, - Utf8 | Binary => Utf8, + Utf8View | Utf8 | Binary => Utf8, Null => Null, Dictionary(_, t) => match **t { LargeUtf8 | LargeBinary => LargeUtf8, diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 39202d5bf691..f0bfcb9fab3b 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -43,7 +43,7 @@ impl SHA224Func { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8, Binary, LargeBinary], + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index 74deb3fc6caa..0a0044f72206 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -42,7 +42,7 @@ impl SHA256Func { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8, Binary, LargeBinary], + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs index 9b1e1ba9ec3c..7f8220e5f9d5 100644 --- a/datafusion/functions/src/crypto/sha384.rs +++ b/datafusion/functions/src/crypto/sha384.rs @@ -42,7 +42,7 @@ impl SHA384Func { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8, Binary, LargeBinary], + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index c88579fd08ee..d2d51bfa53ab 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -42,7 +42,7 @@ impl SHA512Func { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8, 
Binary, LargeBinary], + vec![Utf8View, Utf8, LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), } diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index c653113fd438..15bf771c6527 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -2225,6 +2225,11 @@ SELECT digest('','blake3'); ---- af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 +# verify utf8view +query ? +SELECT sha224(arrow_cast('tom', 'Utf8View')); +---- +0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d query T SELECT substring('alphabet', 1) diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 2f4af80a9257..9c03f0f25f71 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -954,6 +954,66 @@ logical_plan 01)Projection: nullif(test.column1_utf8view, test.column1_utf8view) AS c 02)--TableScan: test projection=[column1_utf8view] +## Ensure no casts for md5 +query TT +EXPLAIN SELECT + md5(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: md5(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for sha224 +query TT +EXPLAIN SELECT + sha224(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: sha224(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for sha256 +query TT +EXPLAIN SELECT + sha256(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: sha256(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for sha384 +query TT +EXPLAIN SELECT + sha384(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: sha384(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] 
+ +## Ensure no casts for sha512 +query TT +EXPLAIN SELECT + sha512(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: sha512(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for digest +query TT +EXPLAIN SELECT + digest(column1_utf8view, 'md5') as c +FROM test; +---- +logical_plan +01)Projection: digest(test.column1_utf8view, Utf8View("md5")) AS c +02)--TableScan: test projection=[column1_utf8view] + ## Ensure no casts for binary operators # `~` operator (regex match) query TT