Skip to content

Commit

Permalink
feat: Support NOT and column expressions in eval_sql_where (#653)
Browse files Browse the repository at this point in the history
## What changes are proposed in this pull request?

The first attempt at generalizing SQL was incomplete, supporting only
AND/OR and binary comparisons. We can also support NULL literal
expressions, boolean column expressions and NOT, so add them.

## How was this change tested?

Adjusted existing unit tests for the new support (they previously
expected it to not work)
  • Loading branch information
scovich authored Jan 22, 2025
1 parent a4009a2 commit bf7e212
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 41 deletions.
2 changes: 1 addition & 1 deletion kernel/src/expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ impl Expression {
Self::Literal(value.into())
}

pub fn null_literal(data_type: DataType) -> Self {
pub const fn null_literal(data_type: DataType) -> Self {
Self::Literal(Scalar::Null(data_type))
}

Expand Down
69 changes: 61 additions & 8 deletions kernel/src/predicates/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ pub(crate) trait PredicateEvaluator {
}
}

/// Evaluates a predicate with SQL WHERE semantics.
/// Evaluates a (possibly inverted) predicate with SQL WHERE semantics.
///
/// By default, [`eval_expr`] behaves badly for comparisons involving NULL columns (e.g. `a <
/// 10` when `a` is NULL), because the comparison correctly evaluates to NULL, but NULL
Expand Down Expand Up @@ -351,26 +351,79 @@ pub(crate) trait PredicateEvaluator {
/// AND(..., AND(NULL, TRUE, NULL), ...)
/// AND(..., NULL, ...)
/// ```
fn eval_sql_where(&self, filter: &Expr) -> Option<Self::Output> {
use Expr::{Binary, Variadic};
///
/// WARNING: Not an idempotent transform. If data skipping eval produces a sql predicate,
/// evaluating the result with sql semantics has undefined behavior.
fn eval_expr_sql_where(&self, filter: &Expr, inverted: bool) -> Option<Self::Output> {
use Expr::*;
match filter {
Variadic(v) => {
// Recursively invoke `eval_sql_where` instead of the usual `eval_expr` for AND/OR.
let exprs = v.exprs.iter().map(|expr| self.eval_sql_where(expr));
self.finish_eval_variadic(v.op, exprs, false)
// Recursively invoke `eval_expr_sql_where` instead of the usual `eval_expr` for AND/OR.
let exprs = v
.exprs
.iter()
.map(|expr| self.eval_expr_sql_where(expr, inverted));
self.finish_eval_variadic(v.op, exprs, inverted)
}
Binary(BinaryExpression { op, left, right }) if op.is_null_intolerant_comparison() => {
// Perform a nullsafe comparison instead of the usual `eval_binary`
let exprs = [
self.eval_unary(UnaryOperator::IsNull, left, true),
self.eval_unary(UnaryOperator::IsNull, right, true),
self.eval_binary(*op, left, right, false),
self.eval_binary(*op, left, right, inverted),
];
self.finish_eval_variadic(VariadicOperator::And, exprs, false)
}
Unary(UnaryExpression {
op: UnaryOperator::Not,
expr,
}) => self.eval_expr_sql_where(expr, !inverted),
Column(col) => {
// Perform a nullsafe comparison instead of the usual `eval_column`
let exprs = [
self.eval_unary(UnaryOperator::IsNull, filter, true),
self.eval_column(col, inverted),
];
self.finish_eval_variadic(VariadicOperator::And, exprs, false)
}
_ => self.eval_expr(filter, false),
Literal(val) if val.is_null() => {
// AND(NULL IS NOT NULL, NULL) = AND(FALSE, NULL) = FALSE
self.eval_scalar(&Scalar::from(false), false)
}
// Process all remaining expressions normally, because they are not proven safe. Indeed,
// expressions like DISTINCT and IS [NOT] NULL are known-unsafe under SQL semantics:
//
// ```
// x IS NULL # when x really is NULL
// = AND(x IS NOT NULL, x IS NULL)
// = AND(FALSE, TRUE)
// = FALSE
//
// DISTINCT(x, 10) # when x is NULL
// = AND(x IS NOT NULL, 10 IS NOT NULL, DISTINCT(x, 10))
// = AND(FALSE, TRUE, TRUE)
// = FALSE
//
// DISTINCT(x, NULL) # when x is not NULL
// = AND(x IS NOT NULL, NULL IS NOT NULL, DISTINCT(x, NULL))
// = AND(TRUE, FALSE, TRUE)
// = FALSE
// ```
//
_ => self.eval_expr(filter, inverted),
}
}

/// A convenient non-inverted wrapper for [`eval_expr`]
#[cfg(test)]
fn eval(&self, expr: &Expr) -> Option<Self::Output> {
self.eval_expr(expr, false)
}

/// A convenient non-inverted wrapper for [`eval_expr_sql_where`].
fn eval_sql_where(&self, expr: &Expr) -> Option<Self::Output> {
self.eval_expr_sql_where(expr, false)
}
}

/// A collection of provided methods from the [`PredicateEvaluator`] trait, factored out to allow
Expand Down
12 changes: 2 additions & 10 deletions kernel/src/predicates/parquet_stats_skipping/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,7 @@ fn test_eval_binary_comparisons() {
let do_test = |min: Scalar, max: Scalar, expected: &[Option<bool>]| {
let filter = MinMaxTestFilter::new(Some(min.clone()), Some(max.clone()));
for (expr, expect) in expressions.iter().zip(expected.iter()) {
expect_eq!(
filter.eval_expr(expr, false),
*expect,
"{expr:#?} with [{min}..{max}]"
);
expect_eq!(filter.eval(expr), *expect, "{expr:#?} with [{min}..{max}]");
}
};

Expand Down Expand Up @@ -240,11 +236,7 @@ fn test_eval_is_null() {
let do_test = |nullcount: i64, expected: &[Option<bool>]| {
let filter = NullCountTestFilter::new(Some(nullcount), 2);
for (expr, expect) in expressions.iter().zip(expected) {
expect_eq!(
filter.eval_expr(expr, false),
*expect,
"{expr:#?} ({nullcount} nulls)"
);
expect_eq!(filter.eval(expr), *expect, "{expr:#?} ({nullcount} nulls)");
}
};

Expand Down
40 changes: 24 additions & 16 deletions kernel/src/predicates/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -588,62 +588,70 @@ fn test_sql_where() {
let null_filter = DefaultPredicateEvaluator::from(NullColumnResolver);
let empty_filter = DefaultPredicateEvaluator::from(EmptyColumnResolver);

// Basic sanity checks
// Basic sanity check
expect_eq!(null_filter.eval_sql_where(&VAL), None, "WHERE {VAL}");
expect_eq!(null_filter.eval_sql_where(col), None, "WHERE {col}");
expect_eq!(empty_filter.eval_sql_where(&VAL), None, "WHERE {VAL}");

expect_eq!(null_filter.eval_sql_where(col), Some(false), "WHERE {col}");
expect_eq!(empty_filter.eval_sql_where(col), None, "WHERE {col}");

// SQL eval does not modify behavior of IS NULL
let expr = &Expr::is_null(col.clone());
expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}");

// NOT a gets skipped when NULL but not when missing
let expr = &!col.clone();
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

// Injected NULL checks only short circuit if inputs are NULL
let expr = &Expr::lt(FALSE, TRUE);
expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), Some(true), "{expr}");

// Constrast normal vs SQL WHERE semantics - comparison
let expr = &Expr::lt(col.clone(), VAL);
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
// NULL check produces NULL due to missing column
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

let expr = &Expr::lt(VAL, col.clone());
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

let expr = &Expr::distinct(VAL, col.clone());
expect_eq!(null_filter.eval_expr(expr, false), Some(true), "{expr}");
expect_eq!(null_filter.eval(expr), Some(true), "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(true), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

let expr = &Expr::distinct(NULL, col.clone());
expect_eq!(null_filter.eval_expr(expr, false), Some(false), "{expr}");
expect_eq!(null_filter.eval(expr), Some(false), "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

// Constrast normal vs SQL WHERE semantics - comparison inside AND
let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL));
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
let expr = &Expr::and(TRUE, Expr::lt(col.clone(), VAL));
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

// NULL/FALSE (from the NULL check) is stronger than TRUE
let expr = &Expr::and(TRUE, Expr::lt(col.clone(), VAL));
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
// NULL allows static skipping under SQL semantics
let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL));
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), Some(false), "{expr}");

// Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND
let expr = &Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL)));
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
let expr = &Expr::and(TRUE, Expr::and(TRUE, Expr::lt(col.clone(), VAL)));
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");

// Ditto for comparison inside OR inside AND
let expr = &Expr::or(FALSE, Expr::and(NULL, Expr::lt(col.clone(), VAL)));
expect_eq!(null_filter.eval_expr(expr, false), None, "{expr}");
let expr = &Expr::or(FALSE, Expr::and(TRUE, Expr::lt(col.clone(), VAL)));
expect_eq!(null_filter.eval(expr), None, "{expr}");
expect_eq!(null_filter.eval_sql_where(expr), Some(false), "{expr}");
expect_eq!(empty_filter.eval_sql_where(expr), None, "{expr}");
}
2 changes: 1 addition & 1 deletion kernel/src/scan/data_skipping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ mod tests;
/// expression is dropped.
#[cfg(test)]
fn as_data_skipping_predicate(expr: &Expr) -> Option<Expr> {
DataSkippingPredicateCreator.eval_expr(expr, false)
DataSkippingPredicateCreator.eval(expr)
}

/// Like `as_data_skipping_predicate`, but invokes [`PredicateEvaluator::eval_sql_where`] instead
Expand Down
9 changes: 5 additions & 4 deletions kernel/src/scan/data_skipping/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -319,16 +319,17 @@ fn test_sql_where() {
do_test(ALL_NULL, expr, MISSING, None, None);

// Comparison inside AND works
let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL));
let expr = &Expr::and(TRUE, Expr::lt(VAL, col.clone()));
do_test(ALL_NULL, expr, PRESENT, None, Some(false));
do_test(ALL_NULL, expr, MISSING, None, None);

let expr = &Expr::and(TRUE, Expr::lt(VAL, col.clone()));
// NULL inside AND allows static skipping under SQL semantics
let expr = &Expr::and(NULL, Expr::lt(col.clone(), VAL));
do_test(ALL_NULL, expr, PRESENT, None, Some(false));
do_test(ALL_NULL, expr, MISSING, None, None);
do_test(ALL_NULL, expr, MISSING, None, Some(false));

// Comparison inside AND inside AND works
let expr = &Expr::and(TRUE, Expr::and(NULL, Expr::lt(col.clone(), VAL)));
let expr = &Expr::and(TRUE, Expr::and(TRUE, Expr::lt(col.clone(), VAL)));
do_test(ALL_NULL, expr, PRESENT, None, Some(false));
do_test(ALL_NULL, expr, MISSING, None, None);

Expand Down
4 changes: 3 additions & 1 deletion kernel/src/scan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -776,17 +776,19 @@ mod tests {

#[test]
fn test_static_skipping() {
const NULL: Expression = Expression::null_literal(DataType::BOOLEAN);
let test_cases = [
(false, column_expr!("a")),
(true, Expression::literal(false)),
(false, Expression::literal(true)),
(false, Expression::null_literal(DataType::LONG)),
(true, NULL),
(true, Expression::and(column_expr!("a"), false)),
(false, Expression::or(column_expr!("a"), true)),
(false, Expression::or(column_expr!("a"), false)),
(false, Expression::lt(column_expr!("a"), 10)),
(false, Expression::lt(Expression::literal(10), 100)),
(true, Expression::gt(Expression::literal(10), 100)),
(true, Expression::and(NULL, column_expr!("a"))),
];
for (should_skip, predicate) in test_cases {
assert_eq!(
Expand Down

0 comments on commit bf7e212

Please sign in to comment.