From 8a88fdcc23e0c09c51e9c6285d78ffe89bf5935f Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 13 Nov 2024 14:24:42 -0500 Subject: [PATCH 01/17] update docs (#13395) --- dev/update_function_docs.sh | 10 +- docs/source/user-guide/sql/index.rst | 1 - .../source/user-guide/sql/window_functions.md | 122 +++++++- .../user-guide/sql/window_functions_new.md | 290 ------------------ 4 files changed, 110 insertions(+), 313 deletions(-) delete mode 100644 docs/source/user-guide/sql/window_functions_new.md diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh index ad3bc9c7f69c..205ab41984a5 100755 --- a/dev/update_function_docs.sh +++ b/dev/update_function_docs.sh @@ -113,7 +113,7 @@ npx prettier@2.3.2 --write "$TARGET_FILE" echo "'$TARGET_FILE' successfully updated!" -TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" +TARGET_FILE="docs/source/user-guide/sql/window_functions.md" PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- window" echo "Inserting header" @@ -146,13 +146,7 @@ dev/update_function_docs.sh file for updating surrounding text. --> -# Window Functions (NEW) - -Note: this documentation is in the process of being migrated to be [automatically created from the codebase]. -Please see the [Window Functions (Old)](window_functions.md) page for -the rest of the documentation. - -[automatically created from the codebase]: https://github.com/apache/datafusion/issues/12740 +# Window Functions A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. 
diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index 4499aac53611..0508fa12f0f3 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -31,7 +31,6 @@ SQL Reference operators aggregate_functions window_functions - window_functions_new scalar_functions special_functions sql_status diff --git a/docs/source/user-guide/sql/window_functions.md b/docs/source/user-guide/sql/window_functions.md index 8216a3b258b8..a68fdbda6709 100644 --- a/docs/source/user-guide/sql/window_functions.md +++ b/docs/source/user-guide/sql/window_functions.md @@ -12,22 +12,25 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either expressioness or implied. See the License for the + KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> + + # Window Functions A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. - -Note: this documentation is in the process of being migrated to be [automatically created from the codebase]. -Please see the [Window Functions (new)](window_functions_new.md) page for -the rest of the documentation. - -[automatically created from the codebase]: https://github.com/apache/datafusion/issues/12740 - -Window functions are comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result +This is comparable to the type of calculation that can be done with an aggregate function. 
+However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. +Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result Here is an example that shows how to compare each employee's salary with the average salary in his or her department: @@ -146,17 +149,80 @@ RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must All [aggregate functions](aggregate_functions.md) can be used as window functions. -## Analytical functions +## Ranking Functions + +- [cume_dist](#cume_dist) +- [dense_rank](#dense_rank) +- [ntile](#ntile) +- [percent_rank](#percent_rank) +- [rank](#rank) +- [row_number](#row_number) + +### `cume_dist` + +Relative rank of the current row: (number of rows preceding or peer with current row) / (total rows). + +``` +cume_dist() +``` + +### `dense_rank` + +Returns the rank of the current row without gaps. This function ranks rows in a dense manner, meaning consecutive ranks are assigned even for identical values. + +``` +dense_rank() +``` + +### `ntile` + +Integer ranging from 1 to the argument value, dividing the partition as equally as possible + +``` +ntile(expression) +``` + +#### Arguments + +- **expression**: An integer describing the number groups the partition should be split into + +### `percent_rank` + +Returns the percentage rank of the current row within its partition. The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`. + +``` +percent_rank() +``` + +### `rank` + +Returns the rank of the current row within its partition, allowing gaps between ranks. This function provides a ranking similar to `row_number`, but skips ranks for identical values. + +``` +rank() +``` + +### `row_number` + +Number of the current row within its partition, counting from 1. 
+ +``` +row_number() +``` + +## Analytical Functions - [first_value](#first_value) +- [lag](#lag) - [last_value](#last_value) +- [lead](#lead) - [nth_value](#nth_value) ### `first_value` Returns value evaluated at the row that is the first row of the window frame. -```sql +``` first_value(expression) ``` @@ -164,11 +230,25 @@ first_value(expression) - **expression**: Expression to operate on +### `lag` + +Returns value evaluated at the row that is offset rows before the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). + +``` +lag(expression, offset, default) +``` + +#### Arguments + +- **expression**: Expression to operate on +- **offset**: Integer. Specifies how many rows back the value of expression should be retrieved. Defaults to 1. +- **default**: The default value if the offset is not within the partition. Must be of the same type as expression. + ### `last_value` Returns value evaluated at the row that is the last row of the window frame. -```sql +``` last_value(expression) ``` @@ -176,15 +256,29 @@ last_value(expression) - **expression**: Expression to operate on +### `lead` + +Returns value evaluated at the row that is offset rows after the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). + +``` +lead(expression, offset, default) +``` + +#### Arguments + +- **expression**: Expression to operate on +- **offset**: Integer. Specifies how many rows forward the value of expression should be retrieved. Defaults to 1. +- **default**: The default value if the offset is not within the partition. Must be of the same type as expression. + ### `nth_value` Returns value evaluated at the row that is the nth row of the window frame (counting from 1); null if no such row. -```sql +``` nth_value(expression, n) ``` #### Arguments - **expression**: The name the column of which nth value to retrieve -- **n**: Integer. 
Specifies the _n_ in nth +- **n**: Integer. Specifies the n in nth diff --git a/docs/source/user-guide/sql/window_functions_new.md b/docs/source/user-guide/sql/window_functions_new.md deleted file mode 100644 index 1727dececbeb..000000000000 --- a/docs/source/user-guide/sql/window_functions_new.md +++ /dev/null @@ -1,290 +0,0 @@ - - - - -# Window Functions (NEW) - -Note: this documentation is in the process of being migrated to be [automatically created from the codebase]. -Please see the [Window Functions (Old)](window_functions.md) page for -the rest of the documentation. - -[automatically created from the codebase]: https://github.com/apache/datafusion/issues/12740 - -A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. -This is comparable to the type of calculation that can be done with an aggregate function. -However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. -Instead, the rows retain their separate identities. 
Behind the scenes, the window function is able to access more than just the current row of the query result - -Here is an example that shows how to compare each employee's salary with the average salary in his or her department: - -```sql -SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; - -+-----------+-------+--------+-------------------+ -| depname | empno | salary | avg | -+-----------+-------+--------+-------------------+ -| personnel | 2 | 3900 | 3700.0 | -| personnel | 5 | 3500 | 3700.0 | -| develop | 8 | 6000 | 5020.0 | -| develop | 10 | 5200 | 5020.0 | -| develop | 11 | 5200 | 5020.0 | -| develop | 9 | 4500 | 5020.0 | -| develop | 7 | 4200 | 5020.0 | -| sales | 1 | 5000 | 4866.666666666667 | -| sales | 4 | 4800 | 4866.666666666667 | -| sales | 3 | 4800 | 4866.666666666667 | -+-----------+-------+--------+-------------------+ -``` - -A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. - -You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) 
Here is an example: - -```sql -SELECT depname, empno, salary, - rank() OVER (PARTITION BY depname ORDER BY salary DESC) -FROM empsalary; - -+-----------+-------+--------+--------+ -| depname | empno | salary | rank | -+-----------+-------+--------+--------+ -| personnel | 2 | 3900 | 1 | -| develop | 8 | 6000 | 1 | -| develop | 10 | 5200 | 2 | -| develop | 11 | 5200 | 2 | -| develop | 9 | 4500 | 4 | -| develop | 7 | 4200 | 5 | -| sales | 1 | 5000 | 1 | -| sales | 4 | 4800 | 2 | -| personnel | 5 | 3500 | 2 | -| sales | 3 | 4800 | 2 | -+-----------+-------+--------+--------+ -``` - -There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. Here is an example of using window frames in queries: - -```sql -SELECT depname, empno, salary, - avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, - min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min -FROM empsalary -ORDER BY empno ASC; - -+-----------+-------+--------+--------------------+---------+ -| depname | empno | salary | avg | cum_min | -+-----------+-------+--------+--------------------+---------+ -| sales | 1 | 5000 | 5000.0 | 5000 | -| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | -| sales | 3 | 4800 | 4700.0 | 3900 | -| sales | 4 | 4800 | 4866.666666666667 | 3900 | -| personnel | 5 | 3500 | 3700.0 | 3500 | -| develop | 7 | 4200 | 4200.0 | 3500 | -| develop | 8 | 6000 | 5600.0 | 3500 | -| develop | 9 | 4500 | 4500.0 | 3500 | -| develop | 10 | 5200 | 5133.333333333333 | 3500 | -| develop | 11 | 5200 | 5466.666666666667 | 3500 | -+-----------+-------+--------+--------------------+---------+ -``` - -When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and 
error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. For example: - -```sql -SELECT sum(salary) OVER w, avg(salary) OVER w -FROM empsalary -WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); -``` - -## Syntax - -The syntax for the OVER-clause is - -``` -function([expr]) - OVER( - [PARTITION BY expr[, …]] - [ORDER BY expr [ ASC | DESC ][, …]] - [ frame_clause ] - ) -``` - -where **frame_clause** is one of: - -``` - { RANGE | ROWS | GROUPS } frame_start - { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end -``` - -and **frame_start** and **frame_end** can be one of - -```sql -UNBOUNDED PRECEDING -offset PRECEDING -CURRENT ROW -offset FOLLOWING -UNBOUNDED FOLLOWING -``` - -where **offset** is an non-negative integer. - -RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). - -## Aggregate functions - -All [aggregate functions](aggregate_functions.md) can be used as window functions. - -## Ranking Functions - -- [cume_dist](#cume_dist) -- [dense_rank](#dense_rank) -- [ntile](#ntile) -- [percent_rank](#percent_rank) -- [rank](#rank) -- [row_number](#row_number) - -### `cume_dist` - -Relative rank of the current row: (number of rows preceding or peer with current row) / (total rows). - -``` -cume_dist() -``` - -### `dense_rank` - -Returns the rank of the current row without gaps. This function ranks rows in a dense manner, meaning consecutive ranks are assigned even for identical values. - -``` -dense_rank() -``` - -### `ntile` - -Integer ranging from 1 to the argument value, dividing the partition as equally as possible - -``` -ntile(expression) -``` - -#### Arguments - -- **expression**: An integer describing the number groups the partition should be split into - -### `percent_rank` - -Returns the percentage rank of the current row within its partition. 
The value ranges from 0 to 1 and is computed as `(rank - 1) / (total_rows - 1)`. - -``` -percent_rank() -``` - -### `rank` - -Returns the rank of the current row within its partition, allowing gaps between ranks. This function provides a ranking similar to `row_number`, but skips ranks for identical values. - -``` -rank() -``` - -### `row_number` - -Number of the current row within its partition, counting from 1. - -``` -row_number() -``` - -## Analytical Functions - -- [first_value](#first_value) -- [lag](#lag) -- [last_value](#last_value) -- [lead](#lead) -- [nth_value](#nth_value) - -### `first_value` - -Returns value evaluated at the row that is the first row of the window frame. - -``` -first_value(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on - -### `lag` - -Returns value evaluated at the row that is offset rows before the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). - -``` -lag(expression, offset, default) -``` - -#### Arguments - -- **expression**: Expression to operate on -- **offset**: Integer. Specifies how many rows back the value of expression should be retrieved. Defaults to 1. -- **default**: The default value if the offset is not within the partition. Must be of the same type as expression. - -### `last_value` - -Returns value evaluated at the row that is the last row of the window frame. - -``` -last_value(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on - -### `lead` - -Returns value evaluated at the row that is offset rows after the current row within the partition; if there is no such row, instead return default (which must be of the same type as value). - -``` -lead(expression, offset, default) -``` - -#### Arguments - -- **expression**: Expression to operate on -- **offset**: Integer. Specifies how many rows forward the value of expression should be retrieved. Defaults to 1. 
-- **default**: The default value if the offset is not within the partition. Must be of the same type as expression. - -### `nth_value` - -Returns value evaluated at the row that is the nth row of the window frame (counting from 1); null if no such row. - -``` -nth_value(expression, n) -``` - -#### Arguments - -- **expression**: The name the column of which nth value to retrieve -- **n**: Integer. Specifies the n in nth From e041b6a37a9458dae73743a33ecb11a4f92fec5c Mon Sep 17 00:00:00 2001 From: Oleks V Date: Wed, 13 Nov 2024 12:42:11 -0800 Subject: [PATCH 02/17] Minor: SortMergeJoin small refactoring (#13398) * SortMergeJoin: small refactoring --- .../src/joins/sort_merge_join.rs | 186 +++++++++--------- 1 file changed, 96 insertions(+), 90 deletions(-) diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 9307caf1c6ad..a01cd348f0c5 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -371,7 +371,7 @@ impl ExecutionPlan for SortMergeJoinExec { .register(context.memory_pool()); // create join stream - Ok(Box::pin(SMJStream::try_new( + Ok(Box::pin(SortMergeJoinStream::try_new( Arc::clone(&self.schema), self.sort_options.clone(), self.null_equals_null, @@ -461,7 +461,7 @@ impl SortMergeJoinMetrics { /// State of SMJ stream #[derive(Debug, PartialEq, Eq)] -enum SMJState { +enum SortMergeJoinState { /// Init joining with a new streamed row or a new buffered batches Init, /// Polling one streamed row or one buffered batch, or both @@ -597,11 +597,11 @@ struct BufferedBatch { pub null_joined: Vec, /// Size estimation used for reserving / releasing memory pub size_estimation: usize, - /// The indices of buffered batch that failed the join filter. - /// This is a map between buffered row index and a boolean value indicating whether all joined row - /// of the buffered row failed the join filter. 
+ /// The indices of buffered batch that the join filter doesn't satisfy. + /// This is a map between right row index and a boolean value indicating whether all joined row + /// of the right row does not satisfy the filter . /// When dequeuing the buffered batch, we need to produce null joined rows for these indices. - pub join_filter_failed_map: HashMap, + pub join_filter_not_matched_map: HashMap, /// Current buffered batch number of rows. Equal to batch.num_rows() /// but if batch is spilled to disk this property is preferable /// and less expensive @@ -642,7 +642,7 @@ impl BufferedBatch { join_arrays, null_joined: vec![], size_estimation, - join_filter_failed_map: HashMap::new(), + join_filter_not_matched_map: HashMap::new(), num_rows, spill_file: None, } @@ -651,9 +651,9 @@ impl BufferedBatch { /// Sort-merge join stream that consumes streamed and buffered data stream /// and produces joined output -struct SMJStream { +struct SortMergeJoinStream { /// Current state of the stream - pub state: SMJState, + pub state: SortMergeJoinState, /// Output schema pub schema: SchemaRef, /// Sort options of join columns used to sort streamed and buffered data stream @@ -722,7 +722,15 @@ struct JoinedRecordBatches { pub batch_ids: Vec, } -impl RecordBatchStream for SMJStream { +impl JoinedRecordBatches { + fn clear(&mut self) { + self.batches.clear(); + self.batch_ids.clear(); + self.filter_mask = BooleanBuilder::new(); + self.row_indices = UInt64Builder::new(); + } +} +impl RecordBatchStream for SortMergeJoinStream { fn schema(&self) -> SchemaRef { Arc::clone(&self.schema) } @@ -865,6 +873,7 @@ fn get_corrected_filter_mask( let is_null = filter_mask.is_null(i); if val { + // memoize the first seen matched row if !seen_true { last_true_idx = i; } @@ -884,6 +893,8 @@ fn get_corrected_filter_mask( } if last_index { + // If the left row seen as true its needed to output it once + // To do that we mark all other matches for same row as null to avoid the output if seen_true { 
#[allow(clippy::needless_range_loop)] for j in first_row_idx..last_true_idx { @@ -905,7 +916,7 @@ fn get_corrected_filter_mask( } } -impl Stream for SMJStream { +impl Stream for SortMergeJoinStream { type Item = Result; fn poll_next( @@ -916,13 +927,13 @@ impl Stream for SMJStream { let _timer = join_time.timer(); loop { match &self.state { - SMJState::Init => { + SortMergeJoinState::Init => { let streamed_exhausted = self.streamed_state == StreamedState::Exhausted; let buffered_exhausted = self.buffered_state == BufferedState::Exhausted; self.state = if streamed_exhausted && buffered_exhausted { - SMJState::Exhausted + SortMergeJoinState::Exhausted } else { match self.current_ordering { Ordering::Less | Ordering::Equal => { @@ -961,10 +972,10 @@ impl Stream for SMJStream { } } } - SMJState::Polling + SortMergeJoinState::Polling }; } - SMJState::Polling => { + SortMergeJoinState::Polling => { if ![StreamedState::Exhausted, StreamedState::Ready] .contains(&self.streamed_state) { @@ -987,19 +998,19 @@ impl Stream for SMJStream { let buffered_exhausted = self.buffered_state == BufferedState::Exhausted; if streamed_exhausted && buffered_exhausted { - self.state = SMJState::Exhausted; + self.state = SortMergeJoinState::Exhausted; continue; } self.current_ordering = self.compare_streamed_buffered()?; - self.state = SMJState::JoinOutput; + self.state = SortMergeJoinState::JoinOutput; } - SMJState::JoinOutput => { + SortMergeJoinState::JoinOutput => { self.join_partial()?; if self.output_size < self.batch_size { if self.buffered_data.scanning_finished() { self.buffered_data.scanning_reset(); - self.state = SMJState::Init; + self.state = SortMergeJoinState::Init; } } else { self.freeze_all()?; @@ -1029,7 +1040,7 @@ impl Stream for SMJStream { return Poll::Pending; } } - SMJState::Exhausted => { + SortMergeJoinState::Exhausted => { self.freeze_all()?; if !self.output_record_batches.batches.is_empty() { @@ -1059,7 +1070,7 @@ impl Stream for SMJStream { } } -impl SMJStream { 
+impl SortMergeJoinStream { #[allow(clippy::too_many_arguments)] pub fn try_new( schema: SchemaRef, @@ -1079,7 +1090,7 @@ impl SMJStream { let streamed_schema = streamed.schema(); let buffered_schema = buffered.schema(); Ok(Self { - state: SMJState::Init, + state: SortMergeJoinState::Init, sort_options, null_equals_null, schema, @@ -1517,7 +1528,7 @@ impl SMJStream { // For buffered row which is joined with streamed side rows but all joined rows // don't satisfy the join filter let not_matched_buffered_indices = buffered_batch - .join_filter_failed_map + .join_filter_not_matched_map .iter() .filter_map(|(idx, failed)| if *failed { Some(*idx) } else { None }) .collect::>(); @@ -1531,7 +1542,6 @@ impl SMJStream { &buffered_indices, buffered_batch, )? { - //print_batches(&[record_batch.clone()]); let num_rows = record_batch.num_rows(); self.output_record_batches @@ -1545,8 +1555,7 @@ impl SMJStream { .extend(vec![0; num_rows]); self.output_record_batches.batches.push(record_batch); } - //dbg!(&buffered_batch.join_filter_failed_map); - buffered_batch.join_filter_failed_map.clear(); + buffered_batch.join_filter_not_matched_map.clear(); Ok(()) } @@ -1556,31 +1565,31 @@ impl SMJStream { fn freeze_streamed(&mut self) -> Result<()> { for chunk in self.streamed_batch.output_indices.iter_mut() { // The row indices of joined streamed batch - let streamed_indices = chunk.streamed_indices.finish(); + let left_indices = chunk.streamed_indices.finish(); - if streamed_indices.is_empty() { + if left_indices.is_empty() { continue; } - let mut streamed_columns = self + let mut left_columns = self .streamed_batch .batch .columns() .iter() - .map(|column| take(column, &streamed_indices, None)) + .map(|column| take(column, &left_indices, None)) .collect::, ArrowError>>()?; // The row indices of joined buffered batch - let buffered_indices: UInt64Array = chunk.buffered_indices.finish(); - let mut buffered_columns = if matches!(self.join_type, JoinType::LeftMark) { - 
vec![Arc::new(is_not_null(&buffered_indices)?) as ArrayRef] + let right_indices: UInt64Array = chunk.buffered_indices.finish(); + let mut right_columns = if matches!(self.join_type, JoinType::LeftMark) { + vec![Arc::new(is_not_null(&right_indices)?) as ArrayRef] } else if matches!(self.join_type, JoinType::LeftSemi | JoinType::LeftAnti) { vec![] } else if let Some(buffered_idx) = chunk.buffered_batch_idx { - get_buffered_columns( + fetch_right_columns_by_idxs( &self.buffered_data, buffered_idx, - &buffered_indices, + &right_indices, )? } else { // If buffered batch none, meaning it is null joined batch. @@ -1588,29 +1597,30 @@ impl SMJStream { create_unmatched_columns( self.join_type, &self.buffered_schema, - buffered_indices.len(), + right_indices.len(), ) }; // Prepare the columns we apply join filter on later. // Only for joined rows between streamed and buffered. let filter_columns = if chunk.buffered_batch_idx.is_some() { - if matches!(self.join_type, JoinType::Right) { - get_filter_column(&self.filter, &buffered_columns, &streamed_columns) - } else if matches!( - self.join_type, - JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark - ) { - // unwrap is safe here as we check is_some on top of if statement - let buffered_columns = get_buffered_columns( - &self.buffered_data, - chunk.buffered_batch_idx.unwrap(), - &buffered_indices, - )?; + if !matches!(self.join_type, JoinType::Right) { + if matches!( + self.join_type, + JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark + ) { + let right_cols = fetch_right_columns_by_idxs( + &self.buffered_data, + chunk.buffered_batch_idx.unwrap(), + &right_indices, + )?; - get_filter_column(&self.filter, &streamed_columns, &buffered_columns) + get_filter_column(&self.filter, &left_columns, &right_cols) + } else { + get_filter_column(&self.filter, &left_columns, &right_columns) + } } else { - get_filter_column(&self.filter, &streamed_columns, &buffered_columns) + get_filter_column(&self.filter, 
&right_columns, &left_columns) } } else { // This chunk is totally for null joined rows (outer join), we don't need to apply join filter. @@ -1618,12 +1628,12 @@ impl SMJStream { vec![] }; - let columns = if matches!(self.join_type, JoinType::Right) { - buffered_columns.extend(streamed_columns); - buffered_columns + let columns = if !matches!(self.join_type, JoinType::Right) { + left_columns.extend(right_columns); + left_columns } else { - streamed_columns.extend(buffered_columns); - streamed_columns + right_columns.extend(left_columns); + right_columns }; let output_batch = RecordBatch::try_new(Arc::clone(&self.schema), columns)?; @@ -1665,9 +1675,7 @@ impl SMJStream { | JoinType::LeftMark | JoinType::Full ) { - self.output_record_batches - .batches - .push(output_batch.clone()); + self.output_record_batches.batches.push(output_batch); } else { let filtered_batch = filter_record_batch(&output_batch, &mask)?; self.output_record_batches.batches.push(filtered_batch); @@ -1678,12 +1686,10 @@ impl SMJStream { } else { self.output_record_batches.filter_mask.extend(pre_mask); } - self.output_record_batches - .row_indices - .extend(&streamed_indices); + self.output_record_batches.row_indices.extend(&left_indices); self.output_record_batches.batch_ids.extend(vec![ - self.streamed_batch_counter.load(Relaxed); - streamed_indices.len() + self.streamed_batch_counter.load(Relaxed); + left_indices.len() ]); // For outer joins, we need to push the null joined rows to the output if @@ -1697,16 +1703,16 @@ impl SMJStream { for i in 0..pre_mask.len() { // If the buffered row is not joined with streamed side, // skip it. 
- if buffered_indices.is_null(i) { + if right_indices.is_null(i) { continue; } - let buffered_index = buffered_indices.value(i); + let buffered_index = right_indices.value(i); - buffered_batch.join_filter_failed_map.insert( + buffered_batch.join_filter_not_matched_map.insert( buffered_index, *buffered_batch - .join_filter_failed_map + .join_filter_not_matched_map .get(&buffered_index) .unwrap_or(&true) && !pre_mask.value(i), @@ -1765,6 +1771,9 @@ impl SMJStream { let mut batch_ids = &self.output_record_batches.batch_ids; let default_batch_ids = vec![0; record_batch.num_rows()]; + // If only nulls come in and indices sizes doesn't match with expected record batch count + // generate missing indices + // Happens for null joined batches for Full Join if out_indices.null_count() == out_indices.len() && out_indices.len() != record_batch.num_rows() { @@ -1804,32 +1813,32 @@ impl SMJStream { let null_mask = compute::not(corrected_mask)?; let null_joined_batch = filter_record_batch(&record_batch, &null_mask)?; - let mut buffered_columns = create_unmatched_columns( + let mut right_columns = create_unmatched_columns( self.join_type, &self.buffered_schema, null_joined_batch.num_rows(), ); - let columns = if matches!(self.join_type, JoinType::Right) { - let streamed_columns = null_joined_batch + let columns = if !matches!(self.join_type, JoinType::Right) { + let mut left_columns = null_joined_batch .columns() .iter() - .skip(left_columns_length) + .take(right_columns_length) .cloned() .collect::>(); - buffered_columns.extend(streamed_columns); - buffered_columns + left_columns.extend(right_columns); + left_columns } else { - let mut streamed_columns = null_joined_batch + let left_columns = null_joined_batch .columns() .iter() - .take(right_columns_length) + .skip(left_columns_length) .cloned() .collect::>(); - streamed_columns.extend(buffered_columns); - streamed_columns + right_columns.extend(left_columns); + right_columns }; // Push the streamed/buffered batch joined nulls 
to the output @@ -1905,10 +1914,7 @@ impl SMJStream { )?; } - self.output_record_batches.batches.clear(); - self.output_record_batches.batch_ids.clear(); - self.output_record_batches.filter_mask = BooleanBuilder::new(); - self.output_record_batches.row_indices = UInt64Builder::new(); + self.output_record_batches.clear(); Ok(filtered_record_batch) } @@ -1971,39 +1977,39 @@ fn produce_buffered_null_batch( } // Take buffered (right) columns - let buffered_columns = - get_buffered_columns_from_batch(buffered_batch, buffered_indices)?; + let right_columns = + fetch_right_columns_from_batch_by_idxs(buffered_batch, buffered_indices)?; // Create null streamed (left) columns - let mut streamed_columns = streamed_schema + let mut left_columns = streamed_schema .fields() .iter() .map(|f| new_null_array(f.data_type(), buffered_indices.len())) .collect::>(); - streamed_columns.extend(buffered_columns); + left_columns.extend(right_columns); Ok(Some(RecordBatch::try_new( Arc::clone(schema), - streamed_columns, + left_columns, )?)) } -/// Get `buffered_indices` rows for `buffered_data[buffered_batch_idx]` +/// Get `buffered_indices` rows for `buffered_data[buffered_batch_idx]` by specific column indices #[inline(always)] -fn get_buffered_columns( +fn fetch_right_columns_by_idxs( buffered_data: &BufferedData, buffered_batch_idx: usize, buffered_indices: &UInt64Array, ) -> Result> { - get_buffered_columns_from_batch( + fetch_right_columns_from_batch_by_idxs( &buffered_data.batches[buffered_batch_idx], buffered_indices, ) } #[inline(always)] -fn get_buffered_columns_from_batch( +fn fetch_right_columns_from_batch_by_idxs( buffered_batch: &BufferedBatch, buffered_indices: &UInt64Array, ) -> Result> { From 042843ad249f8f9a03fa7b7a42c7e60db7c478b1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 Nov 2024 15:43:27 -0500 Subject: [PATCH 03/17] Minor: Add SQL example for `date_bin` (#13390) --- datafusion/functions/src/datetime/date_bin.rs | 24 +++++++++++++++++ 
.../source/user-guide/sql/scalar_functions.md | 26 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 065201e1caa3..82481f9fff64 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -181,6 +181,30 @@ Calculates time intervals and returns the start of the interval nearest to the s For example, if you "bin" or "window" data into 15 minute intervals, an input timestamp of `2023-01-01T18:18:18Z` will be updated to the start time of the 15 minute bin it is in: `2023-01-01T18:15:00Z`. "#) .with_syntax_example("date_bin(interval, expression, origin-timestamp)") + .with_sql_example(r#"```sql +-- Bin the timestamp into 1 day intervals +> SELECT date_bin(interval '1 day', time) as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T00:00:00 | +| 2023-01-03T00:00:00 | ++---------------------+ +2 row(s) fetched. + +-- Bin the timestamp into 1 day intervals starting at 3AM on 2023-01-01 +> SELECT date_bin(interval '1 day', time, '2023-01-01T03:00:00') as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T03:00:00 | +| 2023-01-03T03:00:00 | ++---------------------+ +2 row(s) fetched. +``` +"#) .with_argument("interval", "Bin interval.") .with_argument("expression", "Time expression to operate on. Can be a constant, column, or function.") .with_argument("origin-timestamp", "Optional. Starting point used to determine bin boundaries. If not specified defaults 1970-01-01T00:00:00Z (the UNIX epoch in UTC). 
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index e9cd2bba7d11..74e7285b76e9 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1954,6 +1954,32 @@ The following intervals are supported: - years - century +#### Example + +```sql +-- Bin the timestamp into 1 day intervals +> SELECT date_bin(interval '1 day', time) as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T00:00:00 | +| 2023-01-03T00:00:00 | ++---------------------+ +2 row(s) fetched. + +-- Bin the timestamp into 1 day intervals starting at 3AM on 2023-01-01 +> SELECT date_bin(interval '1 day', time, '2023-01-01T03:00:00') as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T03:00:00 | +| 2023-01-03T03:00:00 | ++---------------------+ +2 row(s) fetched. 
+``` + ### `date_format` _Alias of [to_char](#to_char)._ From ccf6258a1e02eb01af436f78c7f6430be19fa59c Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Thu, 14 Nov 2024 07:31:49 +0800 Subject: [PATCH 04/17] Add Utf8View to `TypeCategory::Unknown` (#13350) * add utf8view Signed-off-by: jayzhan211 * add test Signed-off-by: jayzhan211 * fix comment Signed-off-by: Jay Zhan --------- Signed-off-by: jayzhan211 Signed-off-by: Jay Zhan --- datafusion/expr-common/src/type_coercion/binary.rs | 8 +++++--- datafusion/sqllogictest/test_files/coalesce.slt | 8 ++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 31fe6a59baee..c32b4951db44 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -330,11 +330,13 @@ impl From<&DataType> for TypeCategory { return TypeCategory::Array; } - // String literal is possible to cast to many other types like numeric or datetime, - // therefore, it is categorized as a unknown type + // It is categorized as unknown type because the type will be resolved later on if matches!( data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Null + DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Utf8View + | DataType::Null ) { return TypeCategory::Unknown; } diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 97e77d0feb3d..06460a005c20 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -242,6 +242,14 @@ none_set statement ok drop table test1 +# coalesce with utf8view +query TTT +select coalesce(arrow_cast(null, 'Utf8View'), arrow_cast('t', 'Utf8')), + arrow_typeof(coalesce(arrow_cast(null, 'Utf8View'), arrow_cast('t', 'Utf8'))), + arrow_typeof(coalesce(arrow_cast(null, 'Utf8'), arrow_cast('t', 'Utf8View'))); +---- +t Utf8View 
Utf8View + # test dict coercion with value statement ok create table t(c varchar) as values ('a'), (null); From 2d86725d294d7f8bd832cff036b794808086400a Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Wed, 13 Nov 2024 19:02:46 -0500 Subject: [PATCH 05/17] fix docs (#13397) --- datafusion/functions-nested/src/string.rs | 14 ++++++--- .../source/user-guide/sql/scalar_functions.md | 30 ++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index 30f3845215fc..ce555c36274e 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -168,16 +168,16 @@ impl ScalarUDFImpl for ArrayToString { } } -static DOCUMENTATION: OnceLock = OnceLock::new(); +static DOCUMENTATION_ARRAY_TO_STRING: OnceLock = OnceLock::new(); fn get_array_to_string_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { + DOCUMENTATION_ARRAY_TO_STRING.get_or_init(|| { Documentation::builder() .with_doc_section(DOC_SECTION_ARRAY) .with_description( "Converts each element to its text representation.", ) - .with_syntax_example("array_to_string(array, delimiter)") + .with_syntax_example("array_to_string(array, delimiter[, null_string])") .with_sql_example( r#"```sql > select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ','); @@ -196,6 +196,10 @@ fn get_array_to_string_doc() -> &'static Documentation { "delimiter", "Array element separator.", ) + .with_argument( + "null_string", + "Optional. String to replace null values in the array. 
If not provided, nulls will be handled by default behavior.", + ) .build() .unwrap() }) @@ -274,8 +278,10 @@ impl ScalarUDFImpl for StringToArray { } } +static DOCUMENTATION_STRING_TO_ARRAY: OnceLock = OnceLock::new(); + fn get_string_to_array_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { + DOCUMENTATION_STRING_TO_ARRAY.get_or_init(|| { Documentation::builder() .with_doc_section(DOC_SECTION_ARRAY) .with_description( diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 74e7285b76e9..a379dfc9ec29 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -3469,13 +3469,14 @@ array_sort(array, desc, nulls_first) Converts each element to its text representation. ``` -array_to_string(array, delimiter) +array_to_string(array, delimiter[, null_string]) ``` #### Arguments - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. - **delimiter**: Array element separator. +- **null_string**: Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior. #### Example @@ -3850,26 +3851,33 @@ range(start, stop, step) ### `string_to_array` -Converts each element to its text representation. +Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL. ``` -array_to_string(array, delimiter) +string_to_array(str, delimiter[, null_str]) ``` #### Arguments -- **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. -- **delimiter**: Array element separator. +- **str**: String expression to split. +- **delimiter**: Delimiter string to split on. +- **null_str**: Substring values to be replaced with `NULL`. 
#### Example ```sql -> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ','); -+----------------------------------------------------+ -| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) | -+----------------------------------------------------+ -| 1,2,3,4,5,6,7,8 | -+----------------------------------------------------+ +> select string_to_array('abc##def', '##'); ++-----------------------------------+ +| string_to_array(Utf8('abc##def')) | ++-----------------------------------+ +| ['abc', 'def'] | ++-----------------------------------+ +> select string_to_array('abc def', ' ', 'def'); ++---------------------------------------------+ +| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) | ++---------------------------------------------+ +| ['abc', NULL] | ++---------------------------------------------+ ``` #### Aliases From 1d1f3534b4ef790d376ed6fde69a3b404c8c988d Mon Sep 17 00:00:00 2001 From: kamille Date: Thu, 14 Nov 2024 08:05:17 +0800 Subject: [PATCH 06/17] refactor: Consolidate single group by column code into sub modules (#13392) * sort out codes of single column group by. * sort out codes. * move row to suitable place, and improve comments. * fix doc. 
--- .../src/aggregates/group_values/mod.rs | 43 +++++++++++++------ .../{multi_column => multi_group_by}/bytes.rs | 4 +- .../bytes_view.rs | 4 +- .../{multi_column => multi_group_by}/mod.rs | 8 +++- .../primitive.rs | 4 +- .../{ => single_group_by}/bytes.rs | 0 .../{ => single_group_by}/bytes_view.rs | 0 .../group_values/single_group_by/mod.rs | 22 ++++++++++ .../{ => single_group_by}/primitive.rs | 0 .../physical-plan/src/aggregates/mod.rs | 2 +- .../src/aggregates/topk/hash_table.rs | 2 +- 11 files changed, 66 insertions(+), 23 deletions(-) rename datafusion/physical-plan/src/aggregates/group_values/{multi_column => multi_group_by}/bytes.rs (99%) rename datafusion/physical-plan/src/aggregates/group_values/{multi_column => multi_group_by}/bytes_view.rs (99%) rename datafusion/physical-plan/src/aggregates/group_values/{multi_column => multi_group_by}/mod.rs (99%) rename datafusion/physical-plan/src/aggregates/group_values/{multi_column => multi_group_by}/primitive.rs (98%) rename datafusion/physical-plan/src/aggregates/group_values/{ => single_group_by}/bytes.rs (100%) rename datafusion/physical-plan/src/aggregates/group_values/{ => single_group_by}/bytes_view.rs (100%) create mode 100644 datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs rename datafusion/physical-plan/src/aggregates/group_values/{ => single_group_by}/primitive.rs (100%) diff --git a/datafusion/physical-plan/src/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs index 12ed25a0ea34..a816203b6812 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/mod.rs @@ -20,24 +20,27 @@ use arrow::record_batch::RecordBatch; use arrow_array::{downcast_primitive, ArrayRef}; use arrow_schema::{DataType, SchemaRef}; -use bytes_view::GroupValuesBytesView; use datafusion_common::Result; -pub(crate) mod primitive; use datafusion_expr::EmitTo; -use 
primitive::GroupValuesPrimitive; -mod multi_column; +pub(crate) mod multi_group_by; + mod row; -use multi_column::GroupValuesColumn; +mod single_group_by; +use datafusion_physical_expr::binary_map::OutputType; +use multi_group_by::GroupValuesColumn; use row::GroupValuesRows; -mod bytes; -mod bytes_view; -use bytes::GroupValuesByes; -use datafusion_physical_expr::binary_map::OutputType; +pub(crate) use single_group_by::primitive::HashValue; -use crate::aggregates::order::GroupOrdering; +use crate::aggregates::{ + group_values::single_group_by::{ + bytes::GroupValuesByes, bytes_view::GroupValuesBytesView, + primitive::GroupValuesPrimitive, + }, + order::GroupOrdering, +}; mod null_builder; @@ -77,7 +80,7 @@ mod null_builder; /// Each distinct group in a hash aggregation is identified by a unique group id /// (usize) which is assigned by instances of this trait. Group ids are /// continuous without gaps, starting from 0. -pub trait GroupValues: Send { +pub(crate) trait GroupValues: Send { /// Calculates the group id for each input row of `cols`, assigning new /// group ids as necessary. /// @@ -106,7 +109,21 @@ pub trait GroupValues: Send { } /// Return a specialized implementation of [`GroupValues`] for the given schema. -pub fn new_group_values( +/// +/// [`GroupValues`] implementations choosing logic: +/// +/// - If group by single column, and type of this column has +/// the specific [`GroupValues`] implementation, such implementation +/// will be chosen. +/// +/// - If group by multiple columns, and all column types have the specific +/// [`GroupColumn`] implementations, [`GroupValuesColumn`] will be chosen. +/// +/// - Otherwise, the general implementation [`GroupValuesRows`] will be chosen. 
+/// +/// [`GroupColumn`]: crate::aggregates::group_values::multi_group_by::GroupColumn +/// +pub(crate) fn new_group_values( schema: SchemaRef, group_ordering: &GroupOrdering, ) -> Result> { @@ -147,7 +164,7 @@ pub fn new_group_values( } } - if multi_column::supported_schema(schema.as_ref()) { + if multi_group_by::supported_schema(schema.as_ref()) { if matches!(group_ordering, GroupOrdering::None) { Ok(Box::new(GroupValuesColumn::::try_new(schema)?)) } else { diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs similarity index 99% rename from datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes.rs rename to datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs index 820d28fc58e7..35a79cbd91ed 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::aggregates::group_values::multi_column::{nulls_equal_to, GroupColumn}; +use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn}; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow::array::{AsArray, BufferBuilder, GenericBinaryArray, GenericStringArray}; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; @@ -403,7 +403,7 @@ where mod tests { use std::sync::Arc; - use crate::aggregates::group_values::multi_column::bytes::ByteGroupValueBuilder; + use crate::aggregates::group_values::multi_group_by::bytes::ByteGroupValueBuilder; use arrow_array::{ArrayRef, StringArray}; use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; use datafusion_physical_expr::binary_map::OutputType; diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes_view.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs similarity index 99% rename from datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes_view.rs rename to datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs index 032b4d9e2a91..811790f4e588 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_column/bytes_view.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/bytes_view.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::aggregates::group_values::multi_column::{nulls_equal_to, GroupColumn}; +use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn}; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow::array::{make_view, AsArray, ByteView}; use arrow::buffer::ScalarBuffer; @@ -544,7 +544,7 @@ impl GroupColumn for ByteViewGroupValueBuilder { mod tests { use std::sync::Arc; - use crate::aggregates::group_values::multi_column::bytes_view::ByteViewGroupValueBuilder; + use crate::aggregates::group_values::multi_group_by::bytes_view::ByteViewGroupValueBuilder; use arrow::array::AsArray; use arrow::datatypes::StringViewType; use arrow_array::{ArrayRef, StringViewArray}; diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_column/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs similarity index 99% rename from datafusion/physical-plan/src/aggregates/group_values/multi_column/mod.rs rename to datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs index 191292c549f4..83b0f9d77369 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_column/mod.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/mod.rs @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. +//! 
`GroupValues` implementations for multi group by cases + mod bytes; mod bytes_view; mod primitive; use std::mem::{self, size_of}; -use crate::aggregates::group_values::multi_column::{ +use crate::aggregates::group_values::multi_group_by::{ bytes::ByteGroupValueBuilder, bytes_view::ByteViewGroupValueBuilder, primitive::PrimitiveGroupValueBuilder, }; @@ -1138,7 +1140,9 @@ mod tests { use datafusion_common::utils::proxy::RawTableAllocExt; use datafusion_expr::EmitTo; - use crate::aggregates::group_values::{multi_column::GroupValuesColumn, GroupValues}; + use crate::aggregates::group_values::{ + multi_group_by::GroupValuesColumn, GroupValues, + }; use super::GroupIndexView; diff --git a/datafusion/physical-plan/src/aggregates/group_values/multi_column/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs similarity index 98% rename from datafusion/physical-plan/src/aggregates/group_values/multi_column/primitive.rs rename to datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs index dff85ff7eb1a..4da482247458 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/multi_column/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/multi_group_by/primitive.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::aggregates::group_values::multi_column::{nulls_equal_to, GroupColumn}; +use crate::aggregates::group_values::multi_group_by::{nulls_equal_to, GroupColumn}; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; use arrow::buffer::ScalarBuffer; use arrow_array::cast::AsArray; @@ -208,7 +208,7 @@ impl GroupColumn mod tests { use std::sync::Arc; - use crate::aggregates::group_values::multi_column::primitive::PrimitiveGroupValueBuilder; + use crate::aggregates::group_values::multi_group_by::primitive::PrimitiveGroupValueBuilder; use arrow::datatypes::Int64Type; use arrow_array::{ArrayRef, Int64Array}; use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; diff --git a/datafusion/physical-plan/src/aggregates/group_values/bytes.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs similarity index 100% rename from datafusion/physical-plan/src/aggregates/group_values/bytes.rs rename to datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes.rs diff --git a/datafusion/physical-plan/src/aggregates/group_values/bytes_view.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs similarity index 100% rename from datafusion/physical-plan/src/aggregates/group_values/bytes_view.rs rename to datafusion/physical-plan/src/aggregates/group_values/single_group_by/bytes_view.rs diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs new file mode 100644 index 000000000000..417618ba66af --- /dev/null +++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/mod.rs @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! `GroupValues` implementations for single group by cases + +pub(crate) mod bytes; +pub(crate) mod bytes_view; +pub(crate) mod primitive; diff --git a/datafusion/physical-plan/src/aggregates/group_values/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs similarity index 100% rename from datafusion/physical-plan/src/aggregates/group_values/primitive.rs rename to datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 2220007fdd72..260c3a1c48de 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -52,7 +52,7 @@ use crate::execution_plan::CardinalityEffect; use datafusion_physical_expr::aggregate::AggregateFunctionExpr; use itertools::Itertools; -pub mod group_values; +pub(crate) mod group_values; mod no_grouping; pub mod order; mod row_hash; diff --git a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs index 34df643b6cf0..23a07ebec305 100644 --- a/datafusion/physical-plan/src/aggregates/topk/hash_table.rs +++ b/datafusion/physical-plan/src/aggregates/topk/hash_table.rs @@ -17,7 +17,7 @@ //! 
A wrapper around `hashbrown::RawTable` that allows entries to be tracked by index -use crate::aggregates::group_values::primitive::HashValue; +use crate::aggregates::group_values::HashValue; use crate::aggregates::topk::heap::Comparable; use ahash::RandomState; use arrow::datatypes::i256; From 37018a9def018d2ad06dd73470ab6e47cabf4a14 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 Nov 2024 21:01:07 -0500 Subject: [PATCH 07/17] Fix typo in concepts-readings-events.md (#13400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pointed out on Discord by @djanderson 🙏 --- docs/source/user-guide/concepts-readings-events.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/concepts-readings-events.md b/docs/source/user-guide/concepts-readings-events.md index e9c8155ac13b..092f8433d47b 100644 --- a/docs/source/user-guide/concepts-readings-events.md +++ b/docs/source/user-guide/concepts-readings-events.md @@ -36,9 +36,9 @@ This is a list of DataFusion related blog posts, articles, and other resources. 
Please open a PR to add any new resources you create or find -- **0204-09-13** [Blog: Using StringView / German Style Strings to make Queries Faster: Part 2 - String Operations](https://www.influxdata.com/blog/faster-queries-with-stringview-part-two-influxdb/) [Reposted on DataFusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-2/) +- **2024-09-13** [Blog: Using StringView / German Style Strings to make Queries Faster: Part 2 - String Operations](https://www.influxdata.com/blog/faster-queries-with-stringview-part-two-influxdb/) [Reposted on DataFusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-2/) -- **0204-09-13** [Blog: Using StringView / German Style Strings to Make Queries Faster: Part 1- Reading Parquet](https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/) [Reposted on Datafusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-1/) +- **2024-09-13** [Blog: Using StringView / German Style Strings to Make Queries Faster: Part 1- Reading Parquet](https://www.influxdata.com/blog/faster-queries-with-stringview-part-one-influxdb/) [Reposted on Datafusion Blog](https://datafusion.apache.org/blog/2024/09/13/string-view-german-style-strings-part-1/) - **2024-10-16** [Blog: Candle Image Segmentation](https://www.letsql.com/posts/candle-image-segmentation/) From 000288ce3ec4cb11803c5b37680d6cddfa01c57e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:32:46 +0800 Subject: [PATCH 08/17] Update sqlparser requirement from 0.51.0 to 0.52.0 (#13373) * Update sqlparser requirement from 0.51.0 to 0.52.0 Updates the requirements on [sqlparser](https://github.com/apache/datafusion-sqlparser-rs) to permit the latest version. 
- [Changelog](https://github.com/apache/datafusion-sqlparser-rs/blob/main/CHANGELOG.md) - [Commits](https://github.com/apache/datafusion-sqlparser-rs/compare/v0.51.0...v0.51.0) --- updated-dependencies: - dependency-name: sqlparser dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update for API changes * Update for better error messages * fix comment * cleanups * Update datafusion-cli cargo.locl --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Andrew Lamb --- Cargo.toml | 2 +- datafusion-cli/Cargo.lock | 148 +++++++++++------- datafusion/sql/src/expr/mod.rs | 11 ++ datafusion/sql/src/parser.rs | 4 +- datafusion/sql/src/statement.rs | 14 +- datafusion/sql/src/unparser/ast.rs | 1 + datafusion/sql/src/unparser/expr.rs | 1 + .../sqllogictest/test_files/distinct_on.slt | 2 +- datafusion/sqllogictest/test_files/select.slt | 4 +- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 10 files changed, 123 insertions(+), 66 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0b5c74e15d13..001153915632 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -145,7 +145,7 @@ recursive = "0.1.1" regex = "1.8" rstest = "0.23.0" serde_json = "1" -sqlparser = { version = "0.51.0", features = ["visitor"] } +sqlparser = { version = "0.52.0", features = ["visitor"] } tempfile = "3" tokio = { version = "1.36", features = ["macros", "rt", "sync"] } url = "2.2" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 02bd01a49905..bfd0411798c9 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -63,9 +63,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" [[package]] name = 
"android-tzdata" @@ -152,7 +152,7 @@ dependencies = [ "snap", "strum 0.25.0", "strum_macros 0.25.3", - "thiserror", + "thiserror 1.0.69", "typed-builder", "uuid", "xz2", @@ -456,9 +456,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.9" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d6448cfb224dd6a9b9ac734f58622dd0d4751f3589f3b777345745f46b2eb14" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -523,9 +523,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded855583fa1d22e88fe39fd6062b062376e50a8211989e07cf5e38d52eb3453" +checksum = "09677244a9da92172c8dc60109b4a9658597d4d298b188dd0018b6a66b410ca4" dependencies = [ "aws-credential-types", "aws-runtime", @@ -545,9 +545,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9177ea1192e6601ae16c7273385690d88a7ed386a00b74a6bc894d12103cd933" +checksum = "81fea2f3a8bb3bd10932ae7ad59cc59f65f270fc9183a7e91f501dc5efbef7ee" dependencies = [ "aws-credential-types", "aws-runtime", @@ -567,9 +567,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.48.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "823ef553cf36713c97453e2ddff1eb8f62be7f4523544e2a5db64caf80100f0a" +checksum = "53dcf5e7d9bd1517b8b998e170e650047cea8a2b85fe1835abe3210713e541b7" dependencies = [ "aws-credential-types", "aws-runtime", @@ -917,9 +917,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.35" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0f57c4b4da2a9d619dd035f27316d7a426305b75be93d09e92f2b9229c34feaf" +checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8" dependencies = [ "jobserver", "libc", @@ -1035,13 +1035,13 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", - "unicode-width", + "unicode-width 0.2.0", ] [[package]] @@ -1097,9 +1097,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "0ca741a962e1b0bff6d724a1a0958b686406e853bb14061f218562e1896f95e6" dependencies = [ "libc", ] @@ -1137,9 +1137,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1716,9 +1716,9 @@ checksum = "a5d9305ccc6942a704f4335694ecd3de2ea531b114ac2d51f5f843750787a92f" [[package]] name = "fastrand" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "fd-lock" @@ -1893,8 +1893,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -2482,9 +2484,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.161" +version = "0.2.162" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" [[package]] name = "libflate" @@ -3037,18 +3039,18 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa37f80ca58604976033fae9515a8a2989fc13797d953f7c04fb8fa36a11f205" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" dependencies = [ "cc", ] [[package]] name = "quad-rand" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" @@ -3062,9 +3064,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" dependencies = [ "bytes", "pin-project-lite", @@ -3073,26 +3075,29 @@ dependencies = [ "rustc-hash", "rustls 0.23.16", "socket2", - "thiserror", + "thiserror 2.0.3", "tokio", "tracing", ] [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = 
"a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", + "getrandom", "rand", "ring", "rustc-hash", "rustls 0.23.16", + "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.3", "tinyvec", "tracing", + "web-time", ] [[package]] @@ -3195,7 +3200,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3212,9 +3217,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -3358,9 +3363,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.39" +version = "0.38.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee" +checksum = "99e4ea3e1cdc4b559b8e5650f9c8e5998e3e5c1343b4eaf034565f32318d63c0" dependencies = [ "bitflags 2.6.0", "errno", @@ -3443,6 +3448,9 @@ name = "rustls-pki-types" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +dependencies = [ + "web-time", +] [[package]] name = "rustls-webpki" @@ -3488,7 +3496,7 @@ dependencies = [ "nix", "radix_trie", "unicode-segmentation", - "unicode-width", + "unicode-width 0.1.14", "utf8parse", "windows-sys 0.52.0", ] @@ -3548,9 +3556,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" +checksum = 
"fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" dependencies = [ "core-foundation-sys", "libc", @@ -3570,18 +3578,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -3704,9 +3712,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.51.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +checksum = "9a875d8cd437cc8a97e9aeaeea352ec9a19aea99c23e9effb17757291de80b08" dependencies = [ "log", "sqlparser_derive", @@ -3834,9 +3842,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", "fastrand", @@ -3853,18 +3861,38 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +dependencies = [ + "thiserror-impl 2.0.3", ] [[package]] name = "thiserror-impl" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" dependencies = [ "proc-macro2", "quote", @@ -3948,9 +3976,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.0" +version = "1.41.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", "bytes", @@ -4123,6 +4151,12 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/datafusion/sql/src/expr/mod.rs 
b/datafusion/sql/src/expr/mod.rs index 72f88abcea99..8c8d716a6665 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -367,6 +367,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { expr, pattern, escape_char, + any, } => self.sql_like_to_expr( negated, *expr, @@ -375,6 +376,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema, planner_context, false, + any, ), SQLExpr::ILike { @@ -382,6 +384,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { expr, pattern, escape_char, + any, } => self.sql_like_to_expr( negated, *expr, @@ -390,6 +393,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema, planner_context, true, + any, ), SQLExpr::SimilarTo { @@ -532,6 +536,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { left, compare_op, right, + // ANY/SOME are equivalent, this field specifies which the user + // specified but it doesn't affect the plan so ignore the field + is_some: _, } => { let mut binary_expr = RawBinaryExpr { op: compare_op, @@ -779,7 +786,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema: &DFSchema, planner_context: &mut PlannerContext, case_insensitive: bool, + any: bool, ) -> Result { + if any { + return not_impl_err!("ANY in LIKE expression"); + } let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?; let pattern_type = pattern.get_type(schema)?; if pattern_type != DataType::Utf8 && pattern_type != DataType::Null { diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 8a984f1645e9..bd1ed3145ef5 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -141,7 +141,7 @@ pub enum CopyToSource { /// `COPY TO ...` Relation(ObjectName), /// COPY (...query...) TO ... 
- Query(Query), + Query(Box), } impl fmt::Display for CopyToSource { @@ -1444,7 +1444,7 @@ mod tests { }; let query = if let SQLStatement::Query(query) = statement { - *query + query } else { panic!("Expected query, got {statement:?}"); }; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 0ac804b706c8..31b836f32b24 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -152,6 +152,10 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec {} } } @@ -646,6 +650,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, parameters, using, + // has_parentheses specifies the syntax, but the plan is the + // same no matter the synax used, so ignore it + has_parentheses: _, } => { // `USING` is a MySQL-specific syntax and currently not supported. if !using.is_empty() { @@ -661,7 +668,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()?; Ok(LogicalPlan::Statement(PlanStatement::Execute(Execute { - name: ident_to_string(&name), + name: object_name_to_string(&name), parameters, }))) } @@ -680,6 +687,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { full, db_name, filter, + // SHOW TABLES IN/FROM are equivalent, this field specifies which the user + // specified, but it doesn't affect the plan so ignore the field + clause: _, } => self.show_tables_to_plan(extended, full, db_name, filter), Statement::ShowColumns { @@ -1108,7 +1118,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { (plan, input_schema, Some(table_ref)) } CopyToSource::Query(query) => { - let plan = self.query_to_plan(query, &mut PlannerContext::new())?; + let plan = self.query_to_plan(*query, &mut PlannerContext::new())?; let input_schema = Arc::clone(plan.schema()); (plan, input_schema, None) } diff --git a/datafusion/sql/src/unparser/ast.rs b/datafusion/sql/src/unparser/ast.rs index 2de1ce9125a7..cc0812cd71e1 100644 --- a/datafusion/sql/src/unparser/ast.rs +++ b/datafusion/sql/src/unparser/ast.rs @@ -241,6 
+241,7 @@ impl SelectBuilder { pub fn build(&self) -> Result { Ok(ast::Select { distinct: self.distinct.clone(), + top_before_distinct: false, top: self.top.clone(), projection: self.projection.clone(), into: self.into.clone(), diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 0678e7d0306c..8f6ffa51f76a 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -282,6 +282,7 @@ impl Unparser<'_> { expr: Box::new(self.expr_to_sql_inner(expr)?), pattern: Box::new(self.expr_to_sql_inner(pattern)?), escape_char: escape_char.map(|c| c.to_string()), + any: false, }), Expr::AggregateFunction(agg) => { let func_name = agg.func.name(); diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index 604ac95ff476..cc0ebf83a843 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -153,7 +153,7 @@ b 1 29 -18218 994303988 5983957848665088916 204 9489 3275293996 1485709125918647 c 2 1 18109 2033001162 -6513304855495910254 25 43062 1491205016 5863949479783605708 0.110830784 0.929409733247 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW # can't distinct on * -query error DataFusion error: SQL error: ParserError\("Expected: an expression:, found: \*"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \*"\) SELECT DISTINCT ON (*) c1 FROM aggregate_test_100 ORDER BY c1 LIMIT 3; diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index c096f6e692af..c687429ae6ec 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -339,10 +339,10 @@ NULL 1 statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: EOF"\) VALUES -statement error DataFusion error: SQL error: ParserError\("Expected: an expression:, found: \)"\) +statement error 
DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) VALUES () -statement error DataFusion error: SQL error: ParserError\("Expected: an expression:, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) VALUES (1),() statement error DataFusion error: Error during planning: Inconsistent data length across values list: got 2 values in row 1 but expected 1 diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 947eb8630b52..8ebed5b25ca9 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -295,7 +295,7 @@ query error DataFusion error: Error during planning: unnest\(\) requires exactly select unnest(); ## Unnest empty expression in from clause -query error DataFusion error: SQL error: ParserError\("Expected: an expression:, found: \)"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) select * from unnest(); From f35ab7583f1113e5bb97e03f973c67a2eda83cc8 Mon Sep 17 00:00:00 2001 From: niebayes Date: Thu, 14 Nov 2024 11:02:19 +0800 Subject: [PATCH 09/17] chore: expose TypeCoercionRewriter::new and add examples for type coerce expressions (#13387) * chore: make TypeCoercionRewriter::new public * docs: add docs for type coerce expressions * docs: update datafusion-sql readme * docs: update type coercion demo * Apply suggestions from code review Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- datafusion-examples/README.md | 2 +- datafusion-examples/examples/expr_api.rs | 108 +++++++++++++++++- .../optimizer/src/analyzer/type_coercion.rs | 2 +- datafusion/sql/README.md | 4 +- 4 files changed, 111 insertions(+), 5 deletions(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 5f032c3e9cff..75fcaddf8def 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md 
@@ -61,7 +61,7 @@ cargo run --example dataframe - [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory - [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde -- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze `Expr`s +- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s - [`file_stream_provider.rs`](examples/file_stream_provider.rs): Run a query on `FileStreamProvider` which implements `StreamProvider` for reading and writing to arbitrary stream sources / sinks. - [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients - [`function_factory.rs`](examples/function_factory.rs): Register `CREATE FUNCTION` handler to implement SQL macros diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 0eb823302acf..cb0796bdcf73 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use std::sync::Arc; -use arrow::array::{BooleanArray, Int32Array}; +use arrow::array::{BooleanArray, Int32Array, Int8Array}; use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; @@ -28,12 +28,14 @@ use datafusion::functions_aggregate::first_last::first_value_udaf; use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries}; use datafusion::prelude::*; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ScalarValue, ToDFSchema}; use datafusion_expr::execution_props::ExecutionProps; use 
datafusion_expr::expr::BinaryExpr; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator}; +use datafusion_optimizer::analyzer::type_coercion::TypeCoercionRewriter; /// This example demonstrates the DataFusion [`Expr`] API. /// @@ -51,6 +53,7 @@ use datafusion_expr::{ColumnarValue, ExprFunctionExt, ExprSchemable, Operator}; /// 4. Simplify expressions: [`simplify_demo`] /// 5. Analyze predicates for boundary ranges: [`range_analysis_demo`] /// 6. Get the types of the expressions: [`expression_type_demo`] +/// 7. Apply type cocercion to expressions: [`type_coercion_demo`] #[tokio::main] async fn main() -> Result<()> { // The easiest way to do create expressions is to use the @@ -80,6 +83,9 @@ async fn main() -> Result<()> { // See how to determine the data types of expressions expression_type_demo()?; + // See how to type coerce expressions. + type_coercion_demo()?; + Ok(()) } @@ -316,3 +322,103 @@ fn expression_type_demo() -> Result<()> { Ok(()) } + +/// This function demonstrates how to apply type coercion to expressions, such as binary expressions. +/// +/// In most cases, manual type coercion is not required since DataFusion handles it implicitly. +/// However, certain projects may construct `ExecutionPlan`s directly from DataFusion logical expressions, +/// bypassing the construction of DataFusion logical plans. +/// Since constructing `ExecutionPlan`s from logical expressions does not automatically apply type coercion, +/// you may need to handle type coercion manually in these cases. +/// +/// The codes in this function shows various ways to perform type coercion on expressions: +/// 1. Using `SessionContext::create_physical_expr` +/// 2. Using `ExprSimplifier::coerce` +/// 3. Using `TreeNodeRewriter::rewrite` based on `TypeCoercionRewriter` +/// 4. 
Using `TreeNode::transform` +/// +/// Note, this list may not be complete and there may be other methods to apply type coercion to expressions. +fn type_coercion_demo() -> Result<()> { + // Creates a record batch for demo. + let df_schema = DFSchema::from_unqualified_fields( + vec![Field::new("a", DataType::Int8, false)].into(), + HashMap::new(), + )?; + let i8_array = Int8Array::from_iter_values(vec![0, 1, 2]); + let batch = RecordBatch::try_new( + Arc::new(df_schema.as_arrow().to_owned()), + vec![Arc::new(i8_array) as _], + )?; + + // Constructs a binary expression for demo. + // By default, the literal `1` is translated into the Int32 type and cannot be directly compared with the Int8 type. + let expr = col("a").gt(lit(1)); + + // Evaluation with an expression that has not been type coerced cannot succeed. + let props = ExecutionProps::default(); + let physical_expr = + datafusion_physical_expr::create_physical_expr(&expr, &df_schema, &props)?; + let e = physical_expr.evaluate(&batch).unwrap_err(); + assert!(e + .find_root() + .to_string() + .contains("Invalid comparison operation: Int8 > Int32")); + + // 1. Type coercion with `SessionContext::create_physical_expr` which implicitly applies type coercion before constructing the physical expr. + let physical_expr = + SessionContext::new().create_physical_expr(expr.clone(), &df_schema)?; + assert!(physical_expr.evaluate(&batch).is_ok()); + + // 2. Type coercion with `ExprSimplifier::coerce`. + let context = SimplifyContext::new(&props).with_schema(Arc::new(df_schema.clone())); + let simplifier = ExprSimplifier::new(context); + let coerced_expr = simplifier.coerce(expr.clone(), &df_schema)?; + let physical_expr = datafusion_physical_expr::create_physical_expr( + &coerced_expr, + &df_schema, + &props, + )?; + assert!(physical_expr.evaluate(&batch).is_ok()); + + // 3. Type coercion with `TypeCoercionRewriter`. + let coerced_expr = expr + .clone() + .rewrite(&mut TypeCoercionRewriter::new(&df_schema))? 
+ .data; + let physical_expr = datafusion_physical_expr::create_physical_expr( + &coerced_expr, + &df_schema, + &props, + )?; + assert!(physical_expr.evaluate(&batch).is_ok()); + + // 4. Apply explict type coercion by manually rewriting the expression + let coerced_expr = expr + .transform(|e| { + // Only type coerces binary expressions. + let Expr::BinaryExpr(e) = e else { + return Ok(Transformed::no(e)); + }; + if let Expr::Column(ref col_expr) = *e.left { + let field = df_schema.field_with_name(None, col_expr.name())?; + let cast_to_type = field.data_type(); + let coerced_right = e.right.cast_to(cast_to_type, &df_schema)?; + Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr::new( + e.left, + e.op, + Box::new(coerced_right), + )))) + } else { + Ok(Transformed::no(Expr::BinaryExpr(e))) + } + })? + .data; + let physical_expr = datafusion_physical_expr::create_physical_expr( + &coerced_expr, + &df_schema, + &props, + )?; + assert!(physical_expr.evaluate(&batch).is_ok()); + + Ok(()) +} diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 9793c4c5490f..b56c2dc604a9 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -158,7 +158,7 @@ pub struct TypeCoercionRewriter<'a> { impl<'a> TypeCoercionRewriter<'a> { /// Create a new [`TypeCoercionRewriter`] with a provided schema /// representing both the inputs and output of the [`LogicalPlan`] node. 
- fn new(schema: &'a DFSchema) -> Self { + pub fn new(schema: &'a DFSchema) -> Self { Self { schema } } diff --git a/datafusion/sql/README.md b/datafusion/sql/README.md index 02ed1ae1f58b..98f3c4faa2ec 100644 --- a/datafusion/sql/README.md +++ b/datafusion/sql/README.md @@ -52,8 +52,8 @@ fn main() { let statement = &ast[0]; // create a logical query plan - let schema_provider = MySchemaProvider::new(); - let sql_to_rel = SqlToRel::new(&schema_provider); + let context_provider = MyContextProvider::new(); + let sql_to_rel = SqlToRel::new(&context_provider); let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); // show the plan From 66180fa24ec889d92e771649738c2b2d362907df Mon Sep 17 00:00:00 2001 From: irenjj Date: Thu, 14 Nov 2024 19:05:43 +0800 Subject: [PATCH 10/17] Migrate code from invoke to invoke_batch. (#13345) * migrate UDF invoke to invoke_batch * fix --------- Co-authored-by: Andrew Lamb --- datafusion/functions/src/datetime/date_bin.rs | 281 +++++++++++------- .../functions/src/datetime/date_trunc.rs | 28 +- .../functions/src/datetime/make_date.rs | 102 ++++--- datafusion/functions/src/datetime/to_char.rs | 60 ++-- datafusion/functions/src/datetime/to_date.rs | 60 ++-- datafusion/functions/src/string/concat.rs | 3 +- datafusion/functions/src/string/concat_ws.rs | 6 +- datafusion/functions/src/string/contains.rs | 3 +- datafusion/functions/src/string/lower.rs | 4 +- datafusion/functions/src/string/upper.rs | 4 +- 10 files changed, 322 insertions(+), 229 deletions(-) diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index 82481f9fff64..671967a89325 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -504,7 +504,7 @@ mod tests { use crate::datetime::date_bin::{date_bin_nanos_interval, DateBinFunc}; use arrow::array::types::TimestampNanosecondType; - use arrow::array::{IntervalDayTimeArray, TimestampNanosecondArray}; + use 
arrow::array::{Array, IntervalDayTimeArray, TimestampNanosecondArray}; use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow::datatypes::{DataType, TimeUnit}; @@ -515,50 +515,68 @@ mod tests { use chrono::TimeDelta; #[test] - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch fn test_date_bin() { - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert!(res.is_ok()); let timestamps = Arc::new((1..6).map(Some).collect::()); - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Array(timestamps), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let batch_size = timestamps.len(); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Array(timestamps), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + batch_size, + ); assert!(res.is_ok()); - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + 
ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert!(res.is_ok()); // stride supports month-day-nano - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some( - IntervalMonthDayNano { - months: 0, - days: 0, - nanoseconds: 1, - }, - ))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some( + IntervalMonthDayNano { + months: 0, + days: 0, + nanoseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert!(res.is_ok()); // @@ -566,99 +584,129 @@ mod tests { // // invalid number of arguments - let res = DateBinFunc::new().invoke(&[ColumnarValue::Scalar( - ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - })), - )]); + let res = DateBinFunc::new().invoke_batch( + &[ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + )))], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN expected two or three arguments" ); // stride: invalid type - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + 
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN expects stride argument to be an INTERVAL but got Interval(YearMonth)" ); // stride: invalid value - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 0, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 0, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN stride must be non-zero" ); // stride: overflow of day-time interval - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( - IntervalDayTime::MAX, - ))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime::MAX, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN stride argument is too large" ); // stride: overflow of month-day-nano interval - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_mdn(0, i32::MAX, 1)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - 
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::new_interval_mdn(0, i32::MAX, 1)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN stride argument is too large" ); // stride: month intervals - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1, 1, 1)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::new_interval_mdn(1, 1, 1)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "This feature is not implemented: DATE_BIN stride does not support combination of month, day and nanosecond intervals" ); // origin: invalid type - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: DATE_BIN expects origin argument to be a TIMESTAMP 
with nanosecond precision but got Timestamp(Microsecond, None)" ); - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert!(res.is_ok()); // unsupported array type for stride @@ -672,11 +720,15 @@ mod tests { }) .collect::(), ); - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Array(intervals), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let batch_size = intervals.len(); + let res = DateBinFunc::new().invoke_batch( + &[ + ColumnarValue::Array(intervals), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + batch_size, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "This feature is not implemented: DATE_BIN only supports literal values for the stride argument, not arrays" @@ -684,14 +736,20 @@ mod tests { // unsupported array type for origin let timestamps = Arc::new((1..6).map(Some).collect::()); - let res = DateBinFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(IntervalDayTime { - days: 0, - milliseconds: 1, - }))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Array(timestamps), - ]); + let batch_size = timestamps.len(); + let res = DateBinFunc::new().invoke_batch( + &[ + 
ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some( + IntervalDayTime { + days: 0, + milliseconds: 1, + }, + ))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Array(timestamps), + ], + batch_size, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "This feature is not implemented: DATE_BIN only supports literal values for the origin argument, not arrays" @@ -806,16 +864,19 @@ mod tests { .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) .collect::() .with_timezone_opt(tz_opt.clone()); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = input.len(); let result = DateBinFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)), - ColumnarValue::Array(Arc::new(input)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( - Some(string_to_timestamp_nanos(origin).unwrap()), - tz_opt.clone(), - )), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)), + ColumnarValue::Array(Arc::new(input)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + Some(string_to_timestamp_nanos(origin).unwrap()), + tz_opt.clone(), + )), + ], + batch_size, + ) .unwrap(); if let ColumnarValue::Array(result) = result { assert_eq!( diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index f8abef601f70..5ec308ef9c81 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -484,7 +484,7 @@ mod tests { use arrow::array::cast::as_primitive_array; use arrow::array::types::TimestampNanosecondType; - use arrow::array::TimestampNanosecondArray; + use arrow::array::{Array, TimestampNanosecondArray}; use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow::datatypes::{DataType, TimeUnit}; use datafusion_common::ScalarValue; @@ -724,12 +724,15 @@ mod tests { .map(|s| 
Some(string_to_timestamp_nanos(s).unwrap())) .collect::() .with_timezone_opt(tz_opt.clone()); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = input.len(); let result = DateTruncFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::from("day")), - ColumnarValue::Array(Arc::new(input)), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::from("day")), + ColumnarValue::Array(Arc::new(input)), + ], + batch_size, + ) .unwrap(); if let ColumnarValue::Array(result) = result { assert_eq!( @@ -883,12 +886,15 @@ mod tests { .map(|s| Some(string_to_timestamp_nanos(s).unwrap())) .collect::() .with_timezone_opt(tz_opt.clone()); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = input.len(); let result = DateTruncFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::from("hour")), - ColumnarValue::Array(Arc::new(input)), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::from("hour")), + ColumnarValue::Array(Arc::new(input)), + ], + batch_size, + ) .unwrap(); if let ColumnarValue::Array(result) = result { assert_eq!( diff --git a/datafusion/functions/src/datetime/make_date.rs b/datafusion/functions/src/datetime/make_date.rs index 6b246cb088a2..a13511f33398 100644 --- a/datafusion/functions/src/datetime/make_date.rs +++ b/datafusion/functions/src/datetime/make_date.rs @@ -234,13 +234,15 @@ mod tests { #[test] fn test_make_date() { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let res = MakeDateFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2024))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2024))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), + ], + 1, + ) .expect("that make_date parsed values without error"); if let 
ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { @@ -249,13 +251,15 @@ mod tests { panic!("Expected a scalar value") } - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let res = MakeDateFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::Int64(Some(2024))), - ColumnarValue::Scalar(ScalarValue::UInt64(Some(1))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Int64(Some(2024))), + ColumnarValue::Scalar(ScalarValue::UInt64(Some(1))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(14))), + ], + 1, + ) .expect("that make_date parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { @@ -264,13 +268,15 @@ mod tests { panic!("Expected a scalar value") } - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let res = MakeDateFunc::new() - .invoke(&[ - ColumnarValue::Scalar(ScalarValue::Utf8(Some("2024".to_string()))), - ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("1".to_string()))), - ColumnarValue::Scalar(ScalarValue::Utf8(Some("14".to_string()))), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Utf8(Some("2024".to_string()))), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some("1".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("14".to_string()))), + ], + 1, + ) .expect("that make_date parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Date32(date)) = res { @@ -282,13 +288,16 @@ mod tests { let years = Arc::new((2021..2025).map(Some).collect::()); let months = Arc::new((1..5).map(Some).collect::()); let days = Arc::new((11..15).map(Some).collect::()); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = years.len(); let res = MakeDateFunc::new() - .invoke(&[ - ColumnarValue::Array(years), - ColumnarValue::Array(months), - ColumnarValue::Array(days), - ]) + .invoke_batch( + &[ + ColumnarValue::Array(years), + 
ColumnarValue::Array(months), + ColumnarValue::Array(days), + ], + batch_size, + ) .expect("that make_date parsed values without error"); if let ColumnarValue::Array(array) = res { @@ -308,45 +317,50 @@ mod tests { // // invalid number of arguments - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let res = MakeDateFunc::new() - .invoke(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); + .invoke_batch(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))], 1); assert_eq!( res.err().unwrap().strip_backtrace(), "Execution error: make_date function requires 3 arguments, got 1" ); // invalid type - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let res = MakeDateFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let res = MakeDateFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::IntervalYearMonth(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Arrow error: Cast error: Casting from Interval(YearMonth) to Int32 not supported" ); // overflow of month - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let res = MakeDateFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), - ColumnarValue::Scalar(ScalarValue::UInt64(Some(u64::MAX))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), - ]); + let res = MakeDateFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), + ColumnarValue::Scalar(ScalarValue::UInt64(Some(u64::MAX))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Arrow error: Cast error: Can't 
cast value 18446744073709551615 to type Int32" ); // overflow of day - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let res = MakeDateFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), - ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), - ColumnarValue::Scalar(ScalarValue::UInt32(Some(u32::MAX))), - ]); + let res = MakeDateFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(2023))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(22))), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(u32::MAX))), + ], + 1, + ); assert_eq!( res.err().unwrap().strip_backtrace(), "Arrow error: Cast error: Can't cast value 4294967295 to type Int32" diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index ef5d6a4f6990..dd4ae7b8464e 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -384,9 +384,11 @@ mod tests { ]; for (value, format, expected) in scalar_data { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let result = ToCharFunc::new() - .invoke(&[ColumnarValue::Scalar(value), ColumnarValue::Scalar(format)]) + .invoke_batch( + &[ColumnarValue::Scalar(value), ColumnarValue::Scalar(format)], + 1, + ) .expect("that to_char parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Utf8(date)) = result { @@ -459,12 +461,15 @@ mod tests { ]; for (value, format, expected) in scalar_array_data { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = format.len(); let result = ToCharFunc::new() - .invoke(&[ - ColumnarValue::Scalar(value), - ColumnarValue::Array(Arc::new(format) as ArrayRef), - ]) + .invoke_batch( + &[ + ColumnarValue::Scalar(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ], + batch_size, + ) .expect("that to_char parsed values without error"); if let ColumnarValue::Scalar(ScalarValue::Utf8(date)) = result { 
@@ -585,12 +590,15 @@ mod tests { ]; for (value, format, expected) in array_scalar_data { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = value.len(); let result = ToCharFunc::new() - .invoke(&[ - ColumnarValue::Array(value as ArrayRef), - ColumnarValue::Scalar(format), - ]) + .invoke_batch( + &[ + ColumnarValue::Array(value as ArrayRef), + ColumnarValue::Scalar(format), + ], + batch_size, + ) .expect("that to_char parsed values without error"); if let ColumnarValue::Array(result) = result { @@ -602,12 +610,15 @@ mod tests { } for (value, format, expected) in array_array_data { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch + let batch_size = value.len(); let result = ToCharFunc::new() - .invoke(&[ - ColumnarValue::Array(value), - ColumnarValue::Array(Arc::new(format) as ArrayRef), - ]) + .invoke_batch( + &[ + ColumnarValue::Array(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ], + batch_size, + ) .expect("that to_char parsed values without error"); if let ColumnarValue::Array(result) = result { @@ -623,20 +634,21 @@ mod tests { // // invalid number of arguments - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let result = ToCharFunc::new() - .invoke(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))]); + .invoke_batch(&[ColumnarValue::Scalar(ScalarValue::Int32(Some(1)))], 1); assert_eq!( result.err().unwrap().strip_backtrace(), "Execution error: to_char function requires 2 arguments, got 1" ); // invalid type - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ToCharFunc::new().invoke(&[ - ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), - ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), - ]); + let result = ToCharFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(1), None)), + ], + 1, + ); assert_eq!( 
result.err().unwrap().strip_backtrace(), "Execution error: Format for `to_char` must be non-null Utf8, received Timestamp(Nanosecond, None)" diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 8f72100416e8..ff322ce31960 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -213,8 +213,8 @@ mod tests { } fn test_scalar(sv: ScalarValue, tc: &TestCase) { - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = ToDateFunc::new().invoke(&[ColumnarValue::Scalar(sv)]); + let to_date_result = + ToDateFunc::new().invoke_batch(&[ColumnarValue::Scalar(sv)], 1); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -234,9 +234,9 @@ mod tests { A: From> + Array + 'static, { let date_array = A::from(vec![tc.date_str]); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = - ToDateFunc::new().invoke(&[ColumnarValue::Array(Arc::new(date_array))]); + let batch_size = date_array.len(); + let to_date_result = ToDateFunc::new() + .invoke_batch(&[ColumnarValue::Array(Arc::new(date_array))], batch_size); match to_date_result { Ok(ColumnarValue::Array(a)) => { @@ -325,11 +325,13 @@ mod tests { fn test_scalar(sv: ScalarValue, tc: &TestCase) { let format_scalar = ScalarValue::Utf8(Some(tc.format_str.to_string())); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = ToDateFunc::new().invoke(&[ - ColumnarValue::Scalar(sv), - ColumnarValue::Scalar(format_scalar), - ]); + let to_date_result = ToDateFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(sv), + ColumnarValue::Scalar(format_scalar), + ], + 1, + ); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -350,11 +352,14 @@ mod tests { let date_array = A::from(vec![tc.formatted_date]); let format_array = A::from(vec![tc.format_str]); - 
#[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = ToDateFunc::new().invoke(&[ - ColumnarValue::Array(Arc::new(date_array)), - ColumnarValue::Array(Arc::new(format_array)), - ]); + let batch_size = date_array.len(); + let to_date_result = ToDateFunc::new().invoke_batch( + &[ + ColumnarValue::Array(Arc::new(date_array)), + ColumnarValue::Array(Arc::new(format_array)), + ], + batch_size, + ); match to_date_result { Ok(ColumnarValue::Array(a)) => { @@ -386,12 +391,14 @@ mod tests { let format1_scalar = ScalarValue::Utf8(Some("%Y-%m-%d".into())); let format2_scalar = ScalarValue::Utf8(Some("%Y/%m/%d".into())); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = ToDateFunc::new().invoke(&[ - ColumnarValue::Scalar(formatted_date_scalar), - ColumnarValue::Scalar(format1_scalar), - ColumnarValue::Scalar(format2_scalar), - ]); + let to_date_result = ToDateFunc::new().invoke_batch( + &[ + ColumnarValue::Scalar(formatted_date_scalar), + ColumnarValue::Scalar(format1_scalar), + ColumnarValue::Scalar(format2_scalar), + ], + 1, + ); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -415,9 +422,8 @@ mod tests { for date_str in test_cases { let formatted_date_scalar = ScalarValue::Utf8(Some(date_str.into())); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let to_date_result = - ToDateFunc::new().invoke(&[ColumnarValue::Scalar(formatted_date_scalar)]); + let to_date_result = ToDateFunc::new() + .invoke_batch(&[ColumnarValue::Scalar(formatted_date_scalar)], 1); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -434,9 +440,8 @@ mod tests { let date_str = "20241231"; let date_scalar = ScalarValue::Utf8(Some(date_str.into())); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let to_date_result = - ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]); + 
ToDateFunc::new().invoke_batch(&[ColumnarValue::Scalar(date_scalar)], 1); match to_date_result { Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => { @@ -456,9 +461,8 @@ mod tests { let date_str = "202412311"; let date_scalar = ScalarValue::Utf8(Some(date_str.into())); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch let to_date_result = - ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]); + ToDateFunc::new().invoke_batch(&[ColumnarValue::Scalar(date_scalar)], 1); if let Ok(ColumnarValue::Scalar(ScalarValue::Date32(_))) = to_date_result { panic!( diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index e429a938b27d..f1e60004ddd0 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -408,8 +408,7 @@ mod tests { ]))); let args = &[c0, c1, c2]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatFunc::new().invoke(args)?; + let result = ConcatFunc::new().invoke_batch(args, 3)?; let expected = Arc::new(StringArray::from(vec!["foo,x", "bar,", "baz,z"])) as ArrayRef; match &result { diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index 611c48a9634a..98a75f121c35 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -467,8 +467,7 @@ mod tests { ]))); let args = &[c0, c1, c2]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatWsFunc::new().invoke(args)?; + let result = ConcatWsFunc::new().invoke_batch(args, 3)?; let expected = Arc::new(StringArray::from(vec!["foo,x", "bar", "baz,z"])) as ArrayRef; match &result { @@ -493,8 +492,7 @@ mod tests { ]))); let args = &[c0, c1, c2]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = ConcatWsFunc::new().invoke(args)?; + let result = ConcatWsFunc::new().invoke_batch(args, 3)?; let 
expected = Arc::new(StringArray::from(vec![Some("foo,x"), None, Some("baz+z")])) as ArrayRef; diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 0c665a139152..3acd2464524d 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -145,8 +145,7 @@ mod test { Some("yyy?()"), ]))); let scalar = ColumnarValue::Scalar(ScalarValue::Utf8(Some("x?(".to_string()))); - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let actual = udf.invoke(&[array, scalar]).unwrap(); + let actual = udf.invoke_batch(&[array, scalar], 2).unwrap(); let expect = ColumnarValue::Array(Arc::new(BooleanArray::from(vec![ Some(true), Some(false), diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index 02770e5e2203..78887fde0a8e 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -104,9 +104,9 @@ mod tests { fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> { let func = LowerFunc::new(); + let batch_size = input.len(); let args = vec![ColumnarValue::Array(input)]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = match func.invoke(&args)? { + let result = match func.invoke_batch(&args, batch_size)? { ColumnarValue::Array(result) => result, _ => unreachable!("lower"), }; diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index 1293e51fa994..5039d094f2d6 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -104,9 +104,9 @@ mod tests { fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> { let func = UpperFunc::new(); + let batch_size = input.len(); let args = vec![ColumnarValue::Array(input)]; - #[allow(deprecated)] // TODO migrate UDF invoke to invoke_batch - let result = match func.invoke(&args)? 
{ + let result = match func.invoke_batch(&args, batch_size)? { ColumnarValue::Array(result) => result, _ => unreachable!("upper"), }; From a5d0563f53d05f5589df83d163c91910f51020ba Mon Sep 17 00:00:00 2001 From: Jax Liu Date: Thu, 14 Nov 2024 21:19:00 +0800 Subject: [PATCH 11/17] Improve the coverage of `bit_length` testing (#13336) * improve the testing coverage of bit_length * fix the issue links --- .../test_files/string/dictionary_utf8.slt | 16 ++++++++++++++++ .../test_files/string/large_string.slt | 16 ++++++++++++++++ .../sqllogictest/test_files/string/string.slt | 16 ++++++++++++++++ .../test_files/string/string_view.slt | 9 +++++---- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt index d0039f8dcd32..c16cfc2ca38e 100644 --- a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt +++ b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt @@ -64,6 +64,22 @@ _ (empty) false false false false NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see issue https://github.com/apache/datafusion/issues/13329 +query IIII +select bit_length(ascii_1), bit_length(ascii_2), bit_length(unicode_1), bit_length(unicode_2) from test_basic_operator; +---- +48 8 144 32 +72 72 176 176 +56 8 240 64 +88 88 104 256 +56 24 216 288 +0 8 0 0 +8 16 0 0 +8 16 0 0 +NULL 8 NULL NULL +NULL 8 NULL 32 + # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index 5b738d0041e4..35fc5694df6f 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -71,6 +71,22 @@ _ (empty) false false false false NULL NULL NULL NULL NULL NULL NULL NULL NULL 
NULL NULL NULL +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see issue https://github.com/apache/datafusion/issues/13329 +query IIII +select bit_length(ascii_1), bit_length(ascii_2), bit_length(unicode_1), bit_length(unicode_2) from test_basic_operator; +---- +48 8 144 32 +72 72 176 176 +56 8 240 64 +88 88 104 256 +56 24 216 288 +0 8 0 0 +8 16 0 0 +8 16 0 0 +NULL 8 NULL NULL +NULL 8 NULL 32 + # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 4e173de84f48..db2b67d66333 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -187,6 +187,22 @@ _ \_ (empty) (empty) true false false false percent p%t pan Tadeusz ma iść w kąt Pan Tadeusz ma frunąć stąd w kąt true false true true under_score un_____core un iść core chrząszcz na łące w 東京都 true false true false +# TODO: move it back to `string_query.slt.part` after fixing the issue +# see issue https://github.com/apache/datafusion/issues/13329 +query IIII +select bit_length(ascii_1), bit_length(ascii_2), bit_length(unicode_1), bit_length(unicode_2) from test_basic_operator; +---- +48 8 144 32 +72 72 176 176 +56 8 240 64 +88 88 104 256 +56 24 216 288 +0 8 0 0 +8 16 0 0 +8 16 0 0 +NULL 8 NULL NULL +NULL 8 NULL 32 + # # Clean up # diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 2f4af80a9257..2b44c86f52d8 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -37,6 +37,11 @@ select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; statement ok drop table test_source +# TODO: Revisit this issue after upgrading to the arrow-rs version that includes apache/arrow-rs#6671. 
+# see issue https://github.com/apache/datafusion/issues/13329 +query error DataFusion error: Arrow error: Compute error: bit_length not supported for Utf8View +select bit_length(ascii_1), bit_length(ascii_2), bit_length(unicode_1), bit_length(unicode_2) from test_basic_operator; + # # common test for string-like functions and operators # @@ -93,10 +98,6 @@ select octet_length(column1_utf8view) from test; 0 NULL -# TODO: Revisit this issue after upgrading to the arrow-rs version that includes apache/arrow-rs#6671. -query error DataFusion error: Arrow error: Compute error: bit_length not supported for Utf8View -select bit_length(column1_utf8view) from test; - query T select btrim(column1_large_utf8) from test; ---- From e7f7a9b159d927f2b4ef335efe6f3d11f3f4071c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Nov 2024 13:56:37 -0500 Subject: [PATCH 12/17] Consolidate dataframe example (#13410) * Consolidate dataframe example * update readme * Update datafusion-examples/examples/dataframe.rs * Update datafusion-examples/README.md --------- Co-authored-by: Oleks V --- datafusion-examples/README.md | 3 +- datafusion-examples/examples/dataframe.rs | 146 ++++++++++-------- .../examples/dataframe_in_memory.rs | 60 ------- 3 files changed, 86 insertions(+), 123 deletions(-) delete mode 100644 datafusion-examples/examples/dataframe_in_memory.rs diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 75fcaddf8def..528e7dd857e5 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -57,8 +57,7 @@ cargo run --example dataframe - [`custom_datasource.rs`](examples/custom_datasource.rs): Run queries against a custom datasource (TableProvider) - [`custom_file_format.rs`](examples/custom_file_format.rs): Write data to a custom file format - [`dataframe-to-s3.rs`](examples/external_dependency/dataframe-to-s3.rs): Run a query using a DataFrame against a parquet file from s3 and writing back to s3 -- 
[`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame against a local parquet file -- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory +- [`dataframe.rs`](examples/dataframe.rs): Run a query using a DataFrame API against parquet files, csv files, and in-memory data - [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame - [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde - [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify, analyze and coerce `Expr`s diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index d7e0068ef88f..59766e881e8b 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ -15,90 +15,82 @@ // specific language governing permissions and limitations // under the License. 
+use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::error::Result; use datafusion::prelude::*; use std::fs::File; use std::io::Write; +use std::sync::Arc; use tempfile::tempdir; -/// This example demonstrates executing a simple query against an Arrow data source (Parquet) and -/// fetching results, using the DataFrame trait +/// This example demonstrates using DataFusion's DataFrame API to +/// +/// * [read_parquet]: execute queries against parquet files +/// * [read_csv]: execute queries against csv files +/// * [read_memory]: execute queries against in-memory arrow data #[tokio::main] async fn main() -> Result<()> { - // create local execution context + // The SessionContext is the main high level API for interacting with DataFusion let ctx = SessionContext::new(); + read_parquet(&ctx).await?; + read_csv(&ctx).await?; + read_memory(&ctx).await?; + Ok(()) +} +/// Use DataFrame API to +/// 1. Read parquet files, +/// 2. Show the schema +/// 3. Select columns and rows +async fn read_parquet(ctx: &SessionContext) -> Result<()> { + // Find the local path of "alltypes_plain.parquet" let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); - // define the query using the DataFrame trait - let df = ctx - .read_parquet(filename, ParquetReadOptions::default()) - .await? - .select_columns(&["id", "bool_col", "timestamp_col"])? 
- .filter(col("id").gt(lit(1)))?; - - // print the results - df.show().await?; - - // create a csv file waiting to be written - let dir = tempdir()?; - let file_path = dir.path().join("example.csv"); - let file = File::create(&file_path)?; - write_csv_file(file); - - // Reading CSV file with inferred schema example - let csv_df = - example_read_csv_file_with_inferred_schema(file_path.to_str().unwrap()).await; - csv_df.show().await?; - - // Reading CSV file with defined schema - let csv_df = example_read_csv_file_with_schema(file_path.to_str().unwrap()).await; - csv_df.show().await?; - - // Reading PARQUET file and print describe + // Read the parquet files and show its schema using 'describe' let parquet_df = ctx .read_parquet(filename, ParquetReadOptions::default()) .await?; - parquet_df.describe().await.unwrap().show().await?; - let dyn_ctx = ctx.enable_url_table(); - let df = dyn_ctx - .sql(&format!("SELECT * FROM '{}'", file_path.to_str().unwrap())) + // show its schema using 'describe' + parquet_df.clone().describe().await?.show().await?; + + // Select three columns and filter the results + // so that only rows where id > 1 are returned + parquet_df + .select_columns(&["id", "bool_col", "timestamp_col"])? + .filter(col("id").gt(lit(1)))? + .show() .await?; - df.show().await?; Ok(()) } -// Function to create an test CSV file -fn write_csv_file(mut file: File) { - // Create the data to put into the csv file with headers - let content = r#"id,time,vote,unixtime,rating -a1,"10 6, 2013",3,1381017600,5.0 -a2,"08 9, 2013",2,1376006400,4.5"#; - // write the data - file.write_all(content.as_ref()) - .expect("Problem with writing file!"); -} +/// Use the DataFrame API to +/// 1. Read CSV files +/// 2. 
Optionally specify schema +async fn read_csv(ctx: &SessionContext) -> Result<()> { + // create example.csv file in a temporary directory + let dir = tempdir()?; + let file_path = dir.path().join("example.csv"); + { + let mut file = File::create(&file_path)?; + // write CSV data + file.write_all( + r#"id,time,vote,unixtime,rating + a1,"10 6, 2013",3,1381017600,5.0 + a2,"08 9, 2013",2,1376006400,4.5"# + .as_bytes(), + )?; + } // scope closes the file + let file_path = file_path.to_str().unwrap(); -// Example to read data from a csv file with inferred schema -async fn example_read_csv_file_with_inferred_schema(file_path: &str) -> DataFrame { - // Create a session context - let ctx = SessionContext::new(); - // Register a lazy DataFrame using the context - ctx.read_csv(file_path, CsvReadOptions::default()) - .await - .unwrap() -} + // You can read a CSV file and DataFusion will infer the schema automatically + let csv_df = ctx.read_csv(file_path, CsvReadOptions::default()).await?; + csv_df.show().await?; -// Example to read csv file with a defined schema for the csv file -async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame { - // Create a session context - let ctx = SessionContext::new(); - // Define the schema + // If you know the types of your data you can specify them explicitly let schema = Schema::new(vec![ Field::new("id", DataType::Utf8, false), Field::new("time", DataType::Utf8, false), @@ -112,6 +104,38 @@ async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame { schema: Some(&schema), ..Default::default() }; - // Register a lazy DataFrame by using the context and option provider - ctx.read_csv(file_path, csv_read_option).await.unwrap() + let csv_df = ctx.read_csv(file_path, csv_read_option).await?; + csv_df.show().await?; + + // You can also create DataFrames from the result of sql queries + // and using the `enable_url_table` refer to local files directly + let dyn_ctx = ctx.clone().enable_url_table(); + let csv_df = 
dyn_ctx + .sql(&format!("SELECT rating, unixtime FROM '{}'", file_path)) + .await?; + csv_df.show().await?; + + Ok(()) +} + +/// Use the DataFrame API to: +/// 1. Read in-memory data. +async fn read_memory(ctx: &SessionContext) -> Result<()> { + // define data in memory + let a: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d"])); + let b: ArrayRef = Arc::new(Int32Array::from(vec![1, 10, 10, 100])); + let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?; + + // declare a table in memory. In Apache Spark API, this corresponds to createDataFrame(...). + ctx.register_batch("t", batch)?; + let df = ctx.table("t").await?; + + // construct an expression corresponding to "SELECT a, b FROM t WHERE b = 10" in SQL + let filter = col("b").eq(lit(10)); + let df = df.select_columns(&["a", "b"])?.filter(filter)?; + + // print the results + df.show().await?; + + Ok(()) } diff --git a/datafusion-examples/examples/dataframe_in_memory.rs b/datafusion-examples/examples/dataframe_in_memory.rs deleted file mode 100644 index c57c38870a7e..000000000000 --- a/datafusion-examples/examples/dataframe_in_memory.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::sync::Arc; - -use datafusion::arrow::array::{Int32Array, StringArray}; -use datafusion::arrow::datatypes::{DataType, Field, Schema}; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::error::Result; -use datafusion::prelude::*; - -/// This example demonstrates how to use the DataFrame API against in-memory data. -#[tokio::main] -async fn main() -> Result<()> { - // define a schema. - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Int32, false), - ])); - - // define data. - let batch = RecordBatch::try_new( - schema, - vec![ - Arc::new(StringArray::from(vec!["a", "b", "c", "d"])), - Arc::new(Int32Array::from(vec![1, 10, 10, 100])), - ], - )?; - - // declare a new context. In spark API, this corresponds to a new spark SQLsession - let ctx = SessionContext::new(); - - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). - ctx.register_batch("t", batch)?; - let df = ctx.table("t").await?; - - // construct an expression corresponding to "SELECT a, b FROM t WHERE b = 10" in SQL - let filter = col("b").eq(lit(10)); - - let df = df.select_columns(&["a", "b"])?.filter(filter)?; - - // print the results - df.show().await?; - - Ok(()) -} From de450d46068bbc6f4e09df4fc156e5376c4f5ff1 Mon Sep 17 00:00:00 2001 From: glfeng Date: Fri, 15 Nov 2024 05:09:46 +0800 Subject: [PATCH 13/17] parquet:Add file_extension for specify file_extension of ParquetReadOptions (#13353) * Add file_extension for specify file_extension of ParquetReadOptions * remove whitespace in #270 --- datafusion/core/src/datasource/file_format/options.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 1e0e28ef88cb..e392515cacb1 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -262,6 +262,12 
@@ impl<'a> ParquetReadOptions<'a> { Default::default() } + /// Specify file_extension + pub fn file_extension(mut self, file_extension: &'a str) -> Self { + self.file_extension = file_extension; + self + } + /// Specify parquet_pruning pub fn parquet_pruning(mut self, parquet_pruning: bool) -> Self { self.parquet_pruning = Some(parquet_pruning); From 57235c2fedc2b428e1e515c3ba713c6e9bc17672 Mon Sep 17 00:00:00 2001 From: Shehab Amin <11789402+shehabgamin@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:13:31 -0800 Subject: [PATCH 14/17] Add getters to `ExecutionPlan` Properties (#13409) * Expose Execution Plan Properties * Expose Execution Plan Properties * Expose Execution Plan Properties * Expose Execution Plan Properties * Expose Execution Plan Properties --- .../core/src/datasource/physical_plan/json.rs | 5 +++++ .../src/joins/sort_merge_join.rs | 18 +++++++++++++++++ datafusion/physical-plan/src/memory.rs | 13 ++++++++++++ .../physical-plan/src/recursive_query.rs | 20 +++++++++++++++++++ .../physical-plan/src/sorts/partial_sort.rs | 5 +++++ datafusion/physical-plan/src/work_table.rs | 10 ++++++++++ 6 files changed, 71 insertions(+) diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 6cb9d9df7047..7b0a605aed05 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -86,6 +86,11 @@ impl NdJsonExec { &self.base_config } + /// Ref to file compression type + pub fn file_compression_type(&self) -> &FileCompressionType { + &self.file_compression_type + } + fn output_partitioning_helper(file_scan_config: &FileScanConfig) -> Partitioning { Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index a01cd348f0c5..5b1a29665868 100644 --- 
a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -204,18 +204,36 @@ impl SortMergeJoinExec { &self.on } + /// Ref to right execution plan pub fn right(&self) -> &Arc { &self.right } + /// Join type pub fn join_type(&self) -> JoinType { self.join_type } + /// Ref to left execution plan pub fn left(&self) -> &Arc { &self.left } + /// Ref to join filter + pub fn filter(&self) -> &Option { + &self.filter + } + + /// Ref to sort options + pub fn sort_options(&self) -> &[SortOptions] { + &self.sort_options + } + + /// Null equals null + pub fn null_equals_null(&self) -> bool { + self.null_equals_null + } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn compute_properties( left: &Arc, diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index c9ada345afc7..272dcdc95bc0 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -178,14 +178,26 @@ impl MemoryExec { self } + /// Ref to partitions pub fn partitions(&self) -> &[Vec] { &self.partitions } + /// Ref to projection pub fn projection(&self) -> &Option> { &self.projection } + /// Show sizes + pub fn show_sizes(&self) -> bool { + self.show_sizes + } + + /// Ref to sort information + pub fn sort_information(&self) -> &[LexOrdering] { + &self.sort_information + } + /// A memory table can be ordered by multiple expressions simultaneously. /// [`EquivalenceProperties`] keeps track of expressions that describe the /// global ordering of the schema. These columns are not necessarily same; e.g. 
@@ -261,6 +273,7 @@ impl MemoryExec { Ok(self) } + /// Arc clone of ref to original schema pub fn original_schema(&self) -> SchemaRef { Arc::clone(&self.schema) } diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index cbf22a4b392f..0137e5d52fea 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -95,6 +95,26 @@ impl RecursiveQueryExec { }) } + /// Ref to name + pub fn name(&self) -> &str { + &self.name + } + + /// Ref to static term + pub fn static_term(&self) -> &Arc { + &self.static_term + } + + /// Ref to recursive term + pub fn recursive_term(&self) -> &Arc { + &self.recursive_term + } + + /// is distinct + pub fn is_distinct(&self) -> bool { + self.is_distinct + } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index e69989c1be91..dde19f46cd4d 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -167,6 +167,11 @@ impl PartialSortExec { self.fetch } + /// Common prefix length + pub fn common_prefix_length(&self) -> usize { + self.common_prefix_length + } + fn output_partitioning_helper( input: &Arc, preserve_partitioning: bool, diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index 61d444171cc7..add386319253 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -120,6 +120,16 @@ impl WorkTableExec { } } + /// Ref to name + pub fn name(&self) -> &str { + &self.name + } + + /// Arc clone of ref to schema + pub fn schema(&self) -> 
SchemaRef { + Arc::clone(&self.schema) + } + pub(super) fn with_work_table(&self, work_table: Arc) -> Self { Self { name: self.name.clone(), From e25f5e7485ffcd810f96c7be096b04b3cacf30b3 Mon Sep 17 00:00:00 2001 From: Leonardo Yvens Date: Fri, 15 Nov 2024 01:39:46 +0000 Subject: [PATCH 15/17] impl table_type for DefaultTableSource (#13416) --- .../src/datasource/default_table_source.rs | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index c37c3b97f4fe..5efabd000d68 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -24,7 +24,7 @@ use crate::datasource::TableProvider; use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, Constraints}; -use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource}; +use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource, TableType}; /// DataFusion default table source, wrapping TableProvider. /// @@ -61,6 +61,11 @@ impl TableSource for DefaultTableSource { self.table_provider.constraints() } + /// Get the type of this table for metadata/catalog purposes. + fn table_type(&self) -> TableType { + self.table_provider.table_type() + } + /// Tests whether the table provider can make use of any or all filter expressions /// to optimise data retrieval. 
fn supports_filters_pushdown( @@ -100,3 +105,41 @@ pub fn source_as_provider( _ => internal_err!("TableSource was not DefaultTableSource"), } } + +#[test] +fn preserves_table_type() { + use async_trait::async_trait; + use datafusion_common::DataFusionError; + + #[derive(Debug)] + struct TestTempTable; + + #[async_trait] + impl TableProvider for TestTempTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_type(&self) -> TableType { + TableType::Temporary + } + + fn schema(&self) -> SchemaRef { + unimplemented!() + } + + async fn scan( + &self, + _: &dyn datafusion_catalog::Session, + _: Option<&Vec>, + _: &[Expr], + _: Option, + ) -> Result, DataFusionError> + { + unimplemented!() + } + } + + let table_source = DefaultTableSource::new(Arc::new(TestTempTable)); + assert_eq!(table_source.table_type(), TableType::Temporary); +} From 75a27a8c14c5244fa213f1b81dffa35bc4527c73 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Nov 2024 09:59:07 -0500 Subject: [PATCH 16/17] Remove `BuiltInWindowFunction` (LogicalPlans) (#13393) * Remove BuiltInWindowFunction * fix docs * Fix typo --- .../expr/src/built_in_window_function.rs | 131 ------------------ datafusion/expr/src/expr.rs | 45 ++---- datafusion/expr/src/expr_schema.rs | 6 - datafusion/expr/src/lib.rs | 2 - datafusion/expr/src/udwf.rs | 12 +- datafusion/physical-plan/src/windows/mod.rs | 4 - datafusion/proto/proto/datafusion.proto | 19 +-- datafusion/proto/src/generated/pbjson.rs | 98 ------------- datafusion/proto/src/generated/prost.rs | 34 +---- .../proto/src/logical_plan/from_proto.rs | 29 +--- datafusion/proto/src/logical_plan/to_proto.rs | 1 - .../proto/src/physical_plan/from_proto.rs | 9 -- 12 files changed, 27 insertions(+), 363 deletions(-) delete mode 100644 datafusion/expr/src/built_in_window_function.rs diff --git a/datafusion/expr/src/built_in_window_function.rs b/datafusion/expr/src/built_in_window_function.rs deleted file mode 100644 index ab41395ad371..000000000000 --- 
a/datafusion/expr/src/built_in_window_function.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Built-in functions module contains all the built-in functions definitions. - -use std::fmt; -use std::str::FromStr; - -use crate::type_coercion::functions::data_types; -use crate::utils; -use crate::{Signature, Volatility}; -use datafusion_common::{plan_datafusion_err, plan_err, DataFusionError, Result}; - -use arrow::datatypes::DataType; - -use strum_macros::EnumIter; - -impl fmt::Display for BuiltInWindowFunction { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.name()) - } -} - -/// A [window function] built in to DataFusion -/// -/// [Window Function]: https://en.wikipedia.org/wiki/Window_function_(SQL) -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, EnumIter)] -pub enum BuiltInWindowFunction { - /// returns value evaluated at the row that is the first row of the window frame - FirstValue, - /// Returns value evaluated at the row that is the last row of the window frame - LastValue, - /// Returns value evaluated at the row that is the nth row of the window frame (counting from 1); returns null if no such row - NthValue, -} - -impl 
BuiltInWindowFunction { - pub fn name(&self) -> &str { - use BuiltInWindowFunction::*; - match self { - FirstValue => "first_value", - LastValue => "last_value", - NthValue => "NTH_VALUE", - } - } -} - -impl FromStr for BuiltInWindowFunction { - type Err = DataFusionError; - fn from_str(name: &str) -> Result { - Ok(match name.to_uppercase().as_str() { - "FIRST_VALUE" => BuiltInWindowFunction::FirstValue, - "LAST_VALUE" => BuiltInWindowFunction::LastValue, - "NTH_VALUE" => BuiltInWindowFunction::NthValue, - _ => return plan_err!("There is no built-in window function named {name}"), - }) - } -} - -/// Returns the datatype of the built-in window function -impl BuiltInWindowFunction { - pub fn return_type(&self, input_expr_types: &[DataType]) -> Result { - // Note that this function *must* return the same type that the respective physical expression returns - // or the execution panics. - - // Verify that this is a valid set of data types for this function - data_types(input_expr_types, &self.signature()) - // Original errors are all related to wrong function signature - // Aggregate them for better error message - .map_err(|_| { - plan_datafusion_err!( - "{}", - utils::generate_signature_error_msg( - &format!("{self}"), - self.signature(), - input_expr_types, - ) - ) - })?; - - match self { - BuiltInWindowFunction::FirstValue - | BuiltInWindowFunction::LastValue - | BuiltInWindowFunction::NthValue => Ok(input_expr_types[0].clone()), - } - } - - /// The signatures supported by the built-in window function `fun`. - pub fn signature(&self) -> Signature { - // Note: The physical expression must accept the type returned by this function or the execution panics. 
- match self { - BuiltInWindowFunction::FirstValue | BuiltInWindowFunction::LastValue => { - Signature::any(1, Volatility::Immutable) - } - BuiltInWindowFunction::NthValue => Signature::any(2, Volatility::Immutable), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use strum::IntoEnumIterator; - #[test] - // Test for BuiltInWindowFunction's Display and from_str() implementations. - // For each variant in BuiltInWindowFunction, it converts the variant to a string - // and then back to a variant. The test asserts that the original variant and - // the reconstructed variant are the same. This assertion is also necessary for - // function suggestion. See https://github.com/apache/datafusion/issues/8082 - fn test_display_and_from_str() { - for func_original in BuiltInWindowFunction::iter() { - let func_name = func_original.to_string(); - let func_from_str = BuiltInWindowFunction::from_str(&func_name).unwrap(); - assert_eq!(func_from_str, func_original); - } - } -} diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 4042d7888b64..83d35c3d25b1 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -27,10 +27,7 @@ use crate::expr_fn::binary_expr; use crate::logical_plan::Subquery; use crate::utils::expr_to_columns; use crate::Volatility; -use crate::{ - udaf, BuiltInWindowFunction, ExprSchemable, Operator, Signature, WindowFrame, - WindowUDF, -}; +use crate::{udaf, ExprSchemable, Operator, Signature, WindowFrame, WindowUDF}; use arrow::datatypes::{DataType, FieldRef}; use datafusion_common::cse::HashNode; @@ -697,13 +694,13 @@ impl AggregateFunction { } } -/// WindowFunction +/// A function used as a SQL window function +/// +/// In SQL, you can use: +/// - Actual window functions ([`WindowUDF`]) +/// - Normal aggregate functions ([`AggregateUDF`]) #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] -/// Defines which implementation of an aggregate function DataFusion should call. 
pub enum WindowFunctionDefinition { - /// A built in aggregate function that leverages an aggregate function - /// A a built-in window function - BuiltInWindowFunction(BuiltInWindowFunction), /// A user defined aggregate function AggregateUDF(Arc), /// A user defined aggregate function @@ -719,9 +716,6 @@ impl WindowFunctionDefinition { display_name: &str, ) -> Result { match self { - WindowFunctionDefinition::BuiltInWindowFunction(fun) => { - fun.return_type(input_expr_types) - } WindowFunctionDefinition::AggregateUDF(fun) => { fun.return_type(input_expr_types) } @@ -734,7 +728,6 @@ impl WindowFunctionDefinition { /// The signatures supported by the function `fun`. pub fn signature(&self) -> Signature { match self { - WindowFunctionDefinition::BuiltInWindowFunction(fun) => fun.signature(), WindowFunctionDefinition::AggregateUDF(fun) => fun.signature().clone(), WindowFunctionDefinition::WindowUDF(fun) => fun.signature().clone(), } @@ -743,7 +736,6 @@ impl WindowFunctionDefinition { /// Function's name for display pub fn name(&self) -> &str { match self { - WindowFunctionDefinition::BuiltInWindowFunction(fun) => fun.name(), WindowFunctionDefinition::WindowUDF(fun) => fun.name(), WindowFunctionDefinition::AggregateUDF(fun) => fun.name(), } @@ -753,19 +745,12 @@ impl WindowFunctionDefinition { impl Display for WindowFunctionDefinition { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match self { - WindowFunctionDefinition::BuiltInWindowFunction(fun) => Display::fmt(fun, f), WindowFunctionDefinition::AggregateUDF(fun) => Display::fmt(fun, f), WindowFunctionDefinition::WindowUDF(fun) => Display::fmt(fun, f), } } } -impl From for WindowFunctionDefinition { - fn from(value: BuiltInWindowFunction) -> Self { - Self::BuiltInWindowFunction(value) - } -} - impl From> for WindowFunctionDefinition { fn from(value: Arc) -> Self { Self::AggregateUDF(value) @@ -780,26 +765,16 @@ impl From> for WindowFunctionDefinition { /// Window function /// -/// Holds the actual actual 
function to call [`WindowFunction`] as well as its +/// Holds the actual function to call [`WindowFunction`] as well as its /// arguments (`args`) and the contents of the `OVER` clause: /// /// 1. `PARTITION BY` /// 2. `ORDER BY` /// 3. Window frame (e.g. `ROWS 1 PRECEDING AND 1 FOLLOWING`) /// -/// # Example -/// ``` -/// # use datafusion_expr::{Expr, BuiltInWindowFunction, col, ExprFunctionExt}; -/// # use datafusion_expr::expr::WindowFunction; -/// // Create FIRST_VALUE(a) OVER (PARTITION BY b ORDER BY c) -/// let expr = Expr::WindowFunction( -/// WindowFunction::new(BuiltInWindowFunction::FirstValue, vec![col("a")]) -/// ) -/// .partition_by(vec![col("b")]) -/// .order_by(vec![col("b").sort(true, true)]) -/// .build() -/// .unwrap(); -/// ``` +/// See [`ExprFunctionExt`] for examples of how to create a `WindowFunction`. +/// +/// [`ExprFunctionExt`]: crate::ExprFunctionExt #[derive(Clone, PartialEq, Eq, PartialOrd, Hash, Debug)] pub struct WindowFunction { /// Name of the function diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 2225f457f626..b1a461eca41d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -481,12 +481,6 @@ impl Expr { .map(|e| e.get_type(schema)) .collect::>>()?; match fun { - WindowFunctionDefinition::BuiltInWindowFunction(window_fun) => { - let return_type = window_fun.return_type(&data_types)?; - let nullable = - !["RANK", "NTILE", "CUME_DIST"].contains(&window_fun.name()); - Ok((return_type, nullable)) - } WindowFunctionDefinition::AggregateUDF(udaf) => { let new_types = data_types_with_aggregate_udf(&data_types, udaf) .map_err(|err| { diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 3faa8192f3eb..27b2d71b1f42 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -28,7 +28,6 @@ //! //! The [expr_fn] module contains functions for creating expressions. 
-mod built_in_window_function; mod literal; mod operation; mod partition_evaluator; @@ -67,7 +66,6 @@ pub mod var_provider; pub mod window_frame; pub mod window_state; -pub use built_in_window_function::BuiltInWindowFunction; pub use datafusion_expr_common::accumulator::Accumulator; pub use datafusion_expr_common::columnar_value::ColumnarValue; pub use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator}; diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 124625280670..475b864a8a18 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -39,8 +39,16 @@ use datafusion_functions_window_common::field::WindowUDFFieldArgs; use datafusion_functions_window_common::partition::PartitionEvaluatorArgs; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; -/// Logical representation of a user-defined window function (UDWF) -/// A UDWF is different from a UDF in that it is stateful across batches. +/// Logical representation of a user-defined window function (UDWF). +/// +/// A Window Function is called via the SQL `OVER` clause: +/// +/// ```sql +/// SELECT first_value(col) OVER (PARTITION BY a, b ORDER BY c) FROM foo; +/// ``` +/// +/// A UDWF is different from a user defined function (UDF) in that it is +/// stateful across batches. 
/// /// See the documentation on [`PartitionEvaluator`] for more details /// diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index d2eb14638c71..a323a958cc76 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -103,9 +103,6 @@ pub fn create_window_expr( ignore_nulls: bool, ) -> Result> { Ok(match fun { - WindowFunctionDefinition::BuiltInWindowFunction(_fun) => { - unreachable!() - } WindowFunctionDefinition::AggregateUDF(fun) => { let aggregate = AggregateExprBuilder::new(Arc::clone(fun), args.to_vec()) .schema(Arc::new(input_schema.clone())) @@ -120,7 +117,6 @@ pub fn create_window_expr( aggregate, ) } - // TODO: Ordering not supported for Window UDFs yet WindowFunctionDefinition::WindowUDF(fun) => Arc::new(BuiltInWindowExpr::new( create_udwf_window_expr(fun, args, input_schema, name, ignore_nulls)?, partition_by, diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 998c172f6ef4..6606b1e93f02 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -507,24 +507,9 @@ message ScalarUDFExprNode { optional bytes fun_definition = 3; } -enum BuiltInWindowFunction { - UNSPECIFIED = 0; // https://protobuf.dev/programming-guides/dos-donts/#unspecified-enum - // ROW_NUMBER = 0; - // RANK = 1; - // DENSE_RANK = 2; - // PERCENT_RANK = 3; - // CUME_DIST = 4; - // NTILE = 5; - // LAG = 6; - // LEAD = 7; - // FIRST_VALUE = 8; - // LAST_VALUE = 9; - // NTH_VALUE = 10; -} - message WindowExprNode { oneof window_function { - BuiltInWindowFunction built_in_function = 2; + // BuiltInWindowFunction built_in_function = 2; string udaf = 3; string udwf = 9; } @@ -866,7 +851,7 @@ message PhysicalAggregateExprNode { message PhysicalWindowExprNode { oneof window_function { - BuiltInWindowFunction built_in_function = 2; + // BuiltInWindowFunction built_in_function = 2; string 
user_defined_aggr_function = 3; } repeated PhysicalExprNode args = 4; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index b5447ad6f473..09c873b1f98a 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -1654,74 +1654,6 @@ impl<'de> serde::Deserialize<'de> for BinaryExprNode { deserializer.deserialize_struct("datafusion.BinaryExprNode", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for BuiltInWindowFunction { - #[allow(deprecated)] - fn serialize(&self, serializer: S) -> std::result::Result - where - S: serde::Serializer, - { - let variant = match self { - Self::Unspecified => "UNSPECIFIED", - }; - serializer.serialize_str(variant) - } -} -impl<'de> serde::Deserialize<'de> for BuiltInWindowFunction { - #[allow(deprecated)] - fn deserialize(deserializer: D) -> std::result::Result - where - D: serde::Deserializer<'de>, - { - const FIELDS: &[&str] = &[ - "UNSPECIFIED", - ]; - - struct GeneratedVisitor; - - impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = BuiltInWindowFunction; - - fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "expected one of: {:?}", &FIELDS) - } - - fn visit_i64(self, v: i64) -> std::result::Result - where - E: serde::de::Error, - { - i32::try_from(v) - .ok() - .and_then(|x| x.try_into().ok()) - .ok_or_else(|| { - serde::de::Error::invalid_value(serde::de::Unexpected::Signed(v), &self) - }) - } - - fn visit_u64(self, v: u64) -> std::result::Result - where - E: serde::de::Error, - { - i32::try_from(v) - .ok() - .and_then(|x| x.try_into().ok()) - .ok_or_else(|| { - serde::de::Error::invalid_value(serde::de::Unexpected::Unsigned(v), &self) - }) - } - - fn visit_str(self, value: &str) -> std::result::Result - where - E: serde::de::Error, - { - match value { - "UNSPECIFIED" => Ok(BuiltInWindowFunction::Unspecified), - _ => Err(serde::de::Error::unknown_variant(value, 
FIELDS)), - } - } - } - deserializer.deserialize_any(GeneratedVisitor) - } -} impl serde::Serialize for CaseNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result @@ -16391,11 +16323,6 @@ impl serde::Serialize for PhysicalWindowExprNode { } if let Some(v) = self.window_function.as_ref() { match v { - physical_window_expr_node::WindowFunction::BuiltInFunction(v) => { - let v = BuiltInWindowFunction::try_from(*v) - .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; - struct_ser.serialize_field("builtInFunction", &v)?; - } physical_window_expr_node::WindowFunction::UserDefinedAggrFunction(v) => { struct_ser.serialize_field("userDefinedAggrFunction", v)?; } @@ -16421,8 +16348,6 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode { "name", "fun_definition", "funDefinition", - "built_in_function", - "builtInFunction", "user_defined_aggr_function", "userDefinedAggrFunction", ]; @@ -16435,7 +16360,6 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode { WindowFrame, Name, FunDefinition, - BuiltInFunction, UserDefinedAggrFunction, } impl<'de> serde::Deserialize<'de> for GeneratedField { @@ -16464,7 +16388,6 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode { "windowFrame" | "window_frame" => Ok(GeneratedField::WindowFrame), "name" => Ok(GeneratedField::Name), "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition), - "builtInFunction" | "built_in_function" => Ok(GeneratedField::BuiltInFunction), "userDefinedAggrFunction" | "user_defined_aggr_function" => Ok(GeneratedField::UserDefinedAggrFunction), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } @@ -16532,12 +16455,6 @@ impl<'de> serde::Deserialize<'de> for PhysicalWindowExprNode { map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0) ; } - GeneratedField::BuiltInFunction => { - if window_function__.is_some() { - return 
Err(serde::de::Error::duplicate_field("builtInFunction")); - } - window_function__ = map_.next_value::<::std::option::Option>()?.map(|x| physical_window_expr_node::WindowFunction::BuiltInFunction(x as i32)); - } GeneratedField::UserDefinedAggrFunction => { if window_function__.is_some() { return Err(serde::de::Error::duplicate_field("userDefinedAggrFunction")); @@ -21475,11 +21392,6 @@ impl serde::Serialize for WindowExprNode { } if let Some(v) = self.window_function.as_ref() { match v { - window_expr_node::WindowFunction::BuiltInFunction(v) => { - let v = BuiltInWindowFunction::try_from(*v) - .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; - struct_ser.serialize_field("builtInFunction", &v)?; - } window_expr_node::WindowFunction::Udaf(v) => { struct_ser.serialize_field("udaf", v)?; } @@ -21507,8 +21419,6 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { "windowFrame", "fun_definition", "funDefinition", - "built_in_function", - "builtInFunction", "udaf", "udwf", ]; @@ -21520,7 +21430,6 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { OrderBy, WindowFrame, FunDefinition, - BuiltInFunction, Udaf, Udwf, } @@ -21549,7 +21458,6 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { "orderBy" | "order_by" => Ok(GeneratedField::OrderBy), "windowFrame" | "window_frame" => Ok(GeneratedField::WindowFrame), "funDefinition" | "fun_definition" => Ok(GeneratedField::FunDefinition), - "builtInFunction" | "built_in_function" => Ok(GeneratedField::BuiltInFunction), "udaf" => Ok(GeneratedField::Udaf), "udwf" => Ok(GeneratedField::Udwf), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), @@ -21611,12 +21519,6 @@ impl<'de> serde::Deserialize<'de> for WindowExprNode { map_.next_value::<::std::option::Option<::pbjson::private::BytesDeserialize<_>>>()?.map(|x| x.0) ; } - GeneratedField::BuiltInFunction => { - if window_function__.is_some() { - return Err(serde::de::Error::duplicate_field("builtInFunction")); - } - window_function__ 
= map_.next_value::<::std::option::Option>()?.map(|x| window_expr_node::WindowFunction::BuiltInFunction(x as i32)); - } GeneratedField::Udaf => { if window_function__.is_some() { return Err(serde::de::Error::duplicate_field("udaf")); diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 40bc8bd9eaf5..ad5320fc657c 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -742,15 +742,14 @@ pub struct WindowExprNode { pub window_frame: ::core::option::Option, #[prost(bytes = "vec", optional, tag = "10")] pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec>, - #[prost(oneof = "window_expr_node::WindowFunction", tags = "2, 3, 9")] + #[prost(oneof = "window_expr_node::WindowFunction", tags = "3, 9")] pub window_function: ::core::option::Option, } /// Nested message and enum types in `WindowExprNode`. pub mod window_expr_node { #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum WindowFunction { - #[prost(enumeration = "super::BuiltInWindowFunction", tag = "2")] - BuiltInFunction(i32), + /// BuiltInWindowFunction built_in_function = 2; #[prost(string, tag = "3")] Udaf(::prost::alloc::string::String), #[prost(string, tag = "9")] @@ -1267,7 +1266,7 @@ pub struct PhysicalWindowExprNode { pub name: ::prost::alloc::string::String, #[prost(bytes = "vec", optional, tag = "9")] pub fun_definition: ::core::option::Option<::prost::alloc::vec::Vec>, - #[prost(oneof = "physical_window_expr_node::WindowFunction", tags = "2, 3")] + #[prost(oneof = "physical_window_expr_node::WindowFunction", tags = "3")] pub window_function: ::core::option::Option< physical_window_expr_node::WindowFunction, >, @@ -1276,8 +1275,7 @@ pub struct PhysicalWindowExprNode { pub mod physical_window_expr_node { #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum WindowFunction { - #[prost(enumeration = "super::BuiltInWindowFunction", tag = "2")] - BuiltInFunction(i32), + /// BuiltInWindowFunction 
built_in_function = 2; #[prost(string, tag = "3")] UserDefinedAggrFunction(::prost::alloc::string::String), } @@ -1837,30 +1835,6 @@ pub struct CteWorkTableScanNode { } #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] -pub enum BuiltInWindowFunction { - /// - Unspecified = 0, -} -impl BuiltInWindowFunction { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - Self::Unspecified => "UNSPECIFIED", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "UNSPECIFIED" => Some(Self::Unspecified), - _ => None, - } - } -} -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] pub enum WindowFrameUnits { Rows = 0, Range = 1, diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 4708e49d4565..301efc42a7c4 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -28,7 +28,7 @@ use datafusion_expr::ExprFunctionExt; use datafusion_expr::{ expr::{self, InList, WindowFunction}, logical_plan::{PlanType, StringifiedPlan}, - Between, BinaryExpr, BuiltInWindowFunction, Case, Cast, Expr, GroupingSet, + Between, BinaryExpr, Case, Cast, Expr, GroupingSet, GroupingSet::GroupingSets, JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -148,14 +148,6 @@ impl From<&protobuf::StringifiedPlan> for StringifiedPlan { } } -impl From for BuiltInWindowFunction { - fn from(built_in_function: protobuf::BuiltInWindowFunction) -> Self { - match built_in_function { - 
protobuf::BuiltInWindowFunction::Unspecified => todo!(), - } - } -} - impl TryFrom for WindowFrame { type Error = Error; @@ -285,25 +277,6 @@ pub fn parse_expr( // TODO: support proto for null treatment match window_function { - window_expr_node::WindowFunction::BuiltInFunction(i) => { - let built_in_function = protobuf::BuiltInWindowFunction::try_from(*i) - .map_err(|_| Error::unknown("BuiltInWindowFunction", *i))? - .into(); - - let args = parse_exprs(&expr.exprs, registry, codec)?; - - Expr::WindowFunction(WindowFunction::new( - expr::WindowFunctionDefinition::BuiltInWindowFunction( - built_in_function, - ), - args, - )) - .partition_by(partition_by) - .order_by(order_by) - .window_frame(window_frame) - .build() - .map_err(Error::DataFusionError) - } window_expr_node::WindowFunction::Udaf(udaf_name) => { let udaf_function = match &expr.fun_definition { Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 5ef64675280e..caceb3db164c 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -306,7 +306,6 @@ pub fn serialize_expr( null_treatment: _, }) => { let (window_function, fun_definition) = match fun { - WindowFunctionDefinition::BuiltInWindowFunction(_fun) => unreachable!(), WindowFunctionDefinition::AggregateUDF(aggr_udf) => { let mut buf = Vec::new(); let _ = codec.try_encode_udaf(aggr_udf, &mut buf); diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 31b59c2a9457..1c5bdd0c02ba 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -146,15 +146,6 @@ pub fn parse_physical_window_expr( let fun = if let Some(window_func) = proto.window_function.as_ref() { match window_func { - protobuf::physical_window_expr_node::WindowFunction::BuiltInFunction(n) => { - let f = 
protobuf::BuiltInWindowFunction::try_from(*n).map_err(|_| { - proto_error(format!( - "Received an unknown window builtin function: {n}" - )) - })?; - - WindowFunctionDefinition::BuiltInWindowFunction(f.into()) - } protobuf::physical_window_expr_node::WindowFunction::UserDefinedAggrFunction(udaf_name) => { WindowFunctionDefinition::AggregateUDF(match &proto.fun_definition { Some(buf) => codec.try_decode_udaf(udaf_name, buf)?, From 8c352708d06c0bde7f9e92cda06efd69b50f16f0 Mon Sep 17 00:00:00 2001 From: Dima <111751109+Dimchikkk@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:01:12 +0000 Subject: [PATCH 17/17] Fix `regex` cache on pattern, less alloc, hash less often (#13414) * cache on pattern, less alloc, hash less often * inline get_pattern * reduce to one hash * remove unnecessary lifetimes --- datafusion/functions/src/regex/regexpcount.rs | 50 +++++++++++++++---- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index 1286c6b5b1bc..8da154430fc5 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -30,6 +30,7 @@ use datafusion_expr::{ }; use itertools::izip; use regex::Regex; +use std::collections::hash_map::Entry; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; @@ -548,16 +549,22 @@ where } } -fn compile_and_cache_regex<'a>( - regex: &'a str, - flags: Option<&'a str>, - regex_cache: &'a mut HashMap<String, Regex>, -) -> Result<&'a Regex, ArrowError> { - if !regex_cache.contains_key(regex) { - let compiled = compile_regex(regex, flags)?; - regex_cache.insert(regex.to_string(), compiled); - } - Ok(regex_cache.get(regex).unwrap()) +fn compile_and_cache_regex<'strings, 'cache>( + regex: &'strings str, + flags: Option<&'strings str>, + regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>, +) -> Result<&'cache Regex, ArrowError> +where + 'strings: 'cache, +{ + let result = 
match regex_cache.entry((regex, flags)) { + Entry::Occupied(occupied_entry) => occupied_entry.into_mut(), + Entry::Vacant(vacant_entry) => { + let compiled = compile_regex(regex, flags)?; + vacant_entry.insert(compiled) + } + }; + Ok(result) } fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> { @@ -634,6 +641,8 @@ mod tests { test_case_sensitive_regexp_count_array_complex::<GenericStringArray<i32>>(); test_case_sensitive_regexp_count_array_complex::<GenericStringArray<i64>>(); test_case_sensitive_regexp_count_array_complex::<StringViewArray>(); + + test_case_regexp_count_cache_check::<GenericStringArray<i32>>(); } fn test_case_sensitive_regexp_count_scalar() { @@ -977,4 +986,25 @@ mod tests { .unwrap(); assert_eq!(re.as_ref(), &expected); } + + fn test_case_regexp_count_cache_check<A>() + where + A: From<Vec<&'static str>> + Array + 'static, + { + let values = A::from(vec!["aaa", "Aaa", "aaa"]); + let regex = A::from(vec!["aaa", "aaa", "aaa"]); + let start = Int64Array::from(vec![1, 1, 1]); + let flags = A::from(vec!["", "i", ""]); + + let expected = Int64Array::from(vec![1, 1, 1]); + + let re = regexp_count_func(&[ + Arc::new(values), + Arc::new(regex), + Arc::new(start), + Arc::new(flags), + ]) + .unwrap(); + assert_eq!(re.as_ref(), &expected); + } }