feat: ANSI support for Add #616

Open · wants to merge 10 commits into base: main

Changes from 6 commits
202 changes: 202 additions & 0 deletions core/src/execution/datafusion/expressions/binary.rs
@@ -0,0 +1,202 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::fmt::{Display, Formatter};
use std::hash::Hasher;
use std::sync::Arc;

use arrow_array::{BooleanArray, RecordBatch};
use arrow_schema::{DataType, Schema};
use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::interval_arithmetic::Interval;
use datafusion_expr::sort_properties::ExprProperties;
use datafusion_expr::{ColumnarValue, Operator};
use datafusion_physical_expr::expressions::{BinaryExpr, Literal};
use datafusion_physical_expr_common::physical_expr::{down_cast_any_ref, PhysicalExpr};

use crate::execution::datafusion::expressions::EvalMode;

use super::arithmetic_overflow_error;

#[derive(Debug, Hash, Clone)]
pub struct CometBinaryExpr {
    left: Arc<dyn PhysicalExpr>,
    op: Operator,
    right: Arc<dyn PhysicalExpr>,
    eval_mode: EvalMode,
    inner: Arc<BinaryExpr>,
}

impl CometBinaryExpr {
    pub fn new(
        left: Arc<dyn PhysicalExpr>,
        op: Operator,
        right: Arc<dyn PhysicalExpr>,
        eval_mode: EvalMode,
    ) -> Self {
        Self {
            left: Arc::clone(&left),
            op,
            right: Arc::clone(&right),
            eval_mode,
            inner: Arc::new(BinaryExpr::new(left, op, right)),
        }
    }

    fn fail_on_overflow(&self, batch: &RecordBatch, result: &ColumnarValue) -> Result<()> {
        if self.eval_mode == EvalMode::Ansi && self.op == Operator::Plus {
            match result.data_type() {
                DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => {
                    self.check_int_overflow(batch, result)?
                }
                _ => {}
            }
        }
        Ok(())
    }

    fn check_int_overflow(&self, batch: &RecordBatch, result: &ColumnarValue) -> Result<()> {
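        // Overflow detection mirrors Java's Math.addExact: with r = x + y,
        // signed overflow occurred iff ((x ^ r) & (y ^ r)) < 0, i.e. the sign
        // of the result differs from the signs of both operands. Note that
        // `self.inner` re-computes the sum r here.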
        let check_overflow_expr = Arc::new(BinaryExpr::new(
Contributor (@eejbyfeldt):

Since arrow provides overflow-checked kernels (apache/arrow-rs#2643), does it make sense to use those directly rather than re-implementing this?
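A minimal standalone sketch of the kernels in question (assuming the current arrow-rs numeric kernels, where `add` returns an error on overflow and `add_wrapping` wraps):

    use arrow::array::{Array, Int32Array};
    use arrow::compute::kernels::numeric::{add, add_wrapping};

    fn main() {
        let a = Int32Array::from(vec![i32::MAX, 1]);
        let b = Int32Array::from(vec![1, 1]);

        // The checked kernel returns an error on i32::MAX + 1 (ANSI-style).
        assert!(add(&a, &b).is_err());

        // The wrapping kernel wraps i32::MAX + 1 to i32::MIN (legacy-style).
        let wrapped = add_wrapping(&a, &b).unwrap();
        assert_eq!(wrapped.len(), 2);
    }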

Contributor Author:

I'm going to review it. Thank you!

Contributor Author:

Hi @eejbyfeldt, I've been looking at DataFusion and I don't see any way to call those arrow operations from DataFusion physical expressions. Do you know if this is implemented yet?

Contributor:

@planga82 Can we make use of the arithmetic kernel https://docs.rs/arrow/latest/arrow/compute/kernels/numeric/fn.add.html to compute the addition and throw an error based on the result?

Contributor @dharanad (Jul 8, 2024):

Can we do something like this?

    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
        use arrow::compute::kernels::numeric::add;
        let lhs = self.left.evaluate(batch)?;
        let rhs = self.right.evaluate(batch)?; // evaluate the right child (the original snippet evaluated `left` twice)
        match (self.op, self.eval_mode) {
            (Operator::Plus, EvalMode::Ansi) => apply(&lhs, &rhs, add),
            _ => self.inner.evaluate(batch),
        }
    }

But the visibility of the apply fn is restricted to pub(crate). We might need to raise a PR with DataFusion to make it public.
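If `apply` stays private, one could inline the same array/scalar dispatch it performs. A rough sketch under that assumption (the `checked_add` name is hypothetical):

    use arrow::compute::kernels::numeric::add;
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::ColumnarValue;

    /// Adds two ColumnarValues with arrow's overflow-checked `add` kernel,
    /// returning an error instead of wrapping on integer overflow.
    fn checked_add(lhs: &ColumnarValue, rhs: &ColumnarValue) -> Result<ColumnarValue> {
        match (lhs, rhs) {
            (ColumnarValue::Array(l), ColumnarValue::Array(r)) => {
                Ok(ColumnarValue::Array(add(&l.as_ref(), &r.as_ref())?))
            }
            (ColumnarValue::Array(l), ColumnarValue::Scalar(r)) => {
                Ok(ColumnarValue::Array(add(&l.as_ref(), &r.to_scalar()?)?))
            }
            (ColumnarValue::Scalar(l), ColumnarValue::Array(r)) => {
                Ok(ColumnarValue::Array(add(&l.to_scalar()?, &r.as_ref())?))
            }
            (ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)) => {
                // Scalar + scalar: compute on length-1 arrays, then extract.
                let sum = add(&l.to_scalar()?, &r.to_scalar()?)?;
                Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(&sum, 0)?))
            }
        }
    }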

Contributor Author @planga82 (Jul 8, 2024):

My concern is that if we use the kernels directly to perform the operations, instead of reusing the DataFusion physical expression, we may lose functionality or have to reimplement it here. From my point of view, in Comet we should try to translate from Spark to DataFusion and add any missing functionality in the form of a wrapper.

Contributor:

Well put, and I agree with what you are saying. I was thinking of overriding the implementation, but your approach makes much more sense: safer and cleaner.

Member:

I agree with @eejbyfeldt that we should just use the existing add_checked or add_wrapped kernels in arrow-rs, which already provide the functionality that we need (unless we discover a compatibility issue compared to the JVM addExact logic). I will create an example to demonstrate how to use this and will post it here later today.

            Arc::new(BinaryExpr::new(
                Arc::new(BinaryExpr::new(
                    self.left.clone(),
                    Operator::BitwiseXor,
                    self.inner.clone(),
                )),
                Operator::BitwiseAnd,
                Arc::new(BinaryExpr::new(
                    self.right.clone(),
                    Operator::BitwiseXor,
                    self.inner.clone(),
                )),
            )),
            Operator::Lt,
            Self::zero_literal(&result.data_type())?,
        ));
        match check_overflow_expr.evaluate(batch)? {
Comment on lines +75 to +91
Member:

This is a very expensive way of implementing this. We don't need to use DataFusion to perform simple math operations when we can implement this in Rust directly as we process the arrays. I think we can delegate to arrow-rs, though, and avoid all of this. As stated in another comment, I will provide an example later today.

            ColumnarValue::Array(array) => {
                let boolean_array = array
                    .as_any()
                    .downcast_ref::<BooleanArray>()
                    .expect("Expected BooleanArray");
Contributor:

Since we are using expect here, this function may panic. Can we instead return an error?
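A sketch of the non-panicking alternative being suggested (hypothetical; the actual fix in a later commit may differ):

    let boolean_array = array
        .as_any()
        .downcast_ref::<BooleanArray>()
        .ok_or_else(|| DataFusionError::Internal("expected BooleanArray".to_string()))?;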

Contributor Author:

Solved! Thanks.

                if boolean_array.true_count() > 0 {
                    return Err(arithmetic_overflow_error(&result.data_type().to_string()).into());
                }
                Ok(())
            }
            ColumnarValue::Scalar(ScalarValue::Boolean(Some(true))) => {
                Err(arithmetic_overflow_error(&result.data_type().to_string()).into())
            }
            _ => Ok(()),
        }
    }

    fn zero_literal(data_type: &DataType) -> Result<Arc<dyn PhysicalExpr>> {
        let zero_literal = match data_type {
            DataType::Int8 => ScalarValue::Int8(Some(0)),
            DataType::Int16 => ScalarValue::Int16(Some(0)),
            DataType::Int32 => ScalarValue::Int32(Some(0)),
            DataType::Int64 => ScalarValue::Int64(Some(0)),
            _ => {
                return Err(DataFusionError::Internal(format!(
                    "Unsupported data type: {:?}",
                    data_type
                )))
            }
        };
        Ok(Arc::new(Literal::new(zero_literal)))
    }
}

impl Display for CometBinaryExpr {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        self.inner.fmt(f)
    }
}

impl PhysicalExpr for CometBinaryExpr {
    fn as_any(&self) -> &dyn Any {
        self.inner.as_any()
    }

    fn data_type(&self, input_schema: &Schema) -> Result<DataType> {
        self.inner.data_type(input_schema)
    }

    fn nullable(&self, input_schema: &Schema) -> Result<bool> {
        self.inner.nullable(input_schema)
    }

    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
        match self.inner.evaluate(batch) {
            Ok(result) => {
                self.fail_on_overflow(batch, &result)?;
Comment on lines +155 to +157
Member:

Evaluating the same expression twice is going to be expensive. We should evaluate just once, either checking for overflows or not, depending on the eval mode.
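A sketch of the single-evaluation shape this suggests (hypothetical; it reuses the `checked_add` helper sketched earlier in this thread):

    fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
        // Evaluate each child exactly once, then pick the kernel by eval mode.
        match (self.op, self.eval_mode) {
            (Operator::Plus, EvalMode::Ansi) => {
                let lhs = self.left.evaluate(batch)?;
                let rhs = self.right.evaluate(batch)?;
                checked_add(&lhs, &rhs)
            }
            _ => self.inner.evaluate(batch),
        }
    }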

                Ok(result)
            }
            Err(e) => Err(e),
        }
    }

    fn evaluate_selection(
        &self,
        batch: &RecordBatch,
        selection: &BooleanArray,
    ) -> Result<ColumnarValue> {
        self.inner.evaluate_selection(batch, selection)
    }

    fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
        self.inner.children()
    }

    fn with_new_children(
        self: Arc<Self>,
        children: Vec<Arc<dyn PhysicalExpr>>,
    ) -> Result<Arc<dyn PhysicalExpr>> {
        Arc::clone(&self.inner).with_new_children(children)
    }

    fn evaluate_bounds(&self, children: &[&Interval]) -> Result<Interval> {
        self.inner.evaluate_bounds(children)
    }

    fn propagate_constraints(
        &self,
        interval: &Interval,
        children: &[&Interval],
    ) -> Result<Option<Vec<Interval>>> {
        self.inner.propagate_constraints(interval, children)
    }

    fn dyn_hash(&self, state: &mut dyn Hasher) {
        self.inner.dyn_hash(state)
    }

    fn get_properties(&self, children: &[ExprProperties]) -> Result<ExprProperties> {
        self.inner.get_properties(children)
    }
}

impl PartialEq<dyn Any> for CometBinaryExpr {
    fn eq(&self, other: &dyn Any) -> bool {
        down_cast_any_ref(other)
            .downcast_ref::<Self>()
            .map(|x| self.left.eq(&x.left) && self.op == x.op && self.right.eq(&x.right))
            .unwrap_or(false)
    }
}
1 change: 1 addition & 0 deletions core/src/execution/datafusion/expressions/mod.rs
@@ -30,6 +30,7 @@ use crate::{errors::CometError, execution::spark_expression};
pub mod abs;
pub mod avg;
pub mod avg_decimal;
+pub mod binary;
pub mod bloom_filter_might_contain;
pub mod correlation;
pub mod covariance;
31 changes: 22 additions & 9 deletions core/src/execution/datafusion/planner.rs
@@ -97,7 +97,7 @@ use crate::{
},
};

-use super::expressions::{abs::CometAbsFunc, EvalMode};
+use super::expressions::{abs::CometAbsFunc, binary::CometBinaryExpr, EvalMode};

// For clippy error on type_complexity.
type ExecResult<T> = Result<T, ExecutionError>;
@@ -160,40 +160,52 @@ impl PhysicalPlanner {
        input_schema: SchemaRef,
    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
        match spark_expr.expr_struct.as_ref().unwrap() {
-            ExprStruct::Add(expr) => self.create_binary_expr(
-                expr.left.as_ref().unwrap(),
-                expr.right.as_ref().unwrap(),
-                expr.return_type.as_ref(),
-                DataFusionOperator::Plus,
-                input_schema,
-            ),
+            ExprStruct::Add(expr) => {
+                let mode = if expr.fail_on_error {
+                    EvalMode::Ansi
+                } else {
+                    EvalMode::Legacy
+                };
+                self.create_binary_expr(
+                    expr.left.as_ref().unwrap(),
+                    expr.right.as_ref().unwrap(),
+                    expr.return_type.as_ref(),
+                    DataFusionOperator::Plus,
+                    input_schema,
+                    mode,
+                )
+            }
            ExprStruct::Subtract(expr) => self.create_binary_expr(
                expr.left.as_ref().unwrap(),
                expr.right.as_ref().unwrap(),
                expr.return_type.as_ref(),
                DataFusionOperator::Minus,
                input_schema,
+                EvalMode::Legacy,
            ),
            ExprStruct::Multiply(expr) => self.create_binary_expr(
                expr.left.as_ref().unwrap(),
                expr.right.as_ref().unwrap(),
                expr.return_type.as_ref(),
                DataFusionOperator::Multiply,
                input_schema,
+                EvalMode::Legacy,
            ),
            ExprStruct::Divide(expr) => self.create_binary_expr(
                expr.left.as_ref().unwrap(),
                expr.right.as_ref().unwrap(),
                expr.return_type.as_ref(),
                DataFusionOperator::Divide,
                input_schema,
+                EvalMode::Legacy,
            ),
            ExprStruct::Remainder(expr) => self.create_binary_expr(
                expr.left.as_ref().unwrap(),
                expr.right.as_ref().unwrap(),
                expr.return_type.as_ref(),
                DataFusionOperator::Modulo,
                input_schema,
+                EvalMode::Legacy,
            ),
            ExprStruct::Eq(expr) => {
                let left = self.create_expr(expr.left.as_ref().unwrap(), input_schema.clone())?;
@@ -627,6 +639,7 @@ impl PhysicalPlanner {
        return_type: Option<&spark_expression::DataType>,
        op: DataFusionOperator,
        input_schema: SchemaRef,
+        eval_mode: EvalMode,
    ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
        let left = self.create_expr(left, input_schema.clone())?;
        let right = self.create_expr(right, input_schema.clone())?;
@@ -681,7 +694,7 @@
                    data_type,
                )))
            }
-            _ => Ok(Arc::new(BinaryExpr::new(left, op, right))),
+            _ => Ok(Arc::new(CometBinaryExpr::new(left, op, right, eval_mode))),
        }
    }
