From 3e8c22b71bb4f8d44525a345bd89dab06aa41175 Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Tue, 26 Dec 2023 17:02:22 +0800 Subject: [PATCH 1/7] support inlist in LiteralGuarantee for pruning. --- .../physical-expr/src/utils/guarantee.rs | 154 ++++++++++++------ 1 file changed, 100 insertions(+), 54 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 59ec255754c0..937e66e6dfc7 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -77,7 +77,7 @@ pub struct LiteralGuarantee { } /// What is guaranteed about the values for a [`LiteralGuarantee`]? -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Guarantee { /// Guarantee that the expression is `true` if `column` is one of the values. If /// `column` is not one of the values, the expression can not be `true`. @@ -94,15 +94,9 @@ impl LiteralGuarantee { /// create these structures from an predicate (boolean expression). fn try_new<'a>( column_name: impl Into, - op: Operator, + guarantee: Guarantee, literals: impl IntoIterator, ) -> Option { - let guarantee = match op { - Operator::Eq => Guarantee::In, - Operator::NotEq => Guarantee::NotIn, - _ => return None, - }; - let literals: HashSet<_> = literals.into_iter().cloned().collect(); Some(Self { @@ -112,6 +106,7 @@ impl LiteralGuarantee { }) } + //Note@wy build literal guarantee from expr /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr` /// to evaluate to `true`. /// @@ -120,7 +115,7 @@ impl LiteralGuarantee { /// expression is guaranteed to be `null` or `false`. /// /// # Notes: - /// 1. `expr` must be a boolean expression. + /// 1. `expr` must be a boolean expression or inlist expression. /// 2. `expr` is not simplified prior to analysis. pub fn analyze(expr: &Arc) -> Vec { // split conjunction: AND AND ... @@ -130,6 +125,39 @@ impl LiteralGuarantee { .fold(GuaranteeBuilder::new(), |builder, expr| { if let Some(cel) = ColOpLit::try_new(expr) { return builder.aggregate_conjunct(cel); + } else if let Some(inlist) = expr + .as_any() + .downcast_ref::() + { + //Only support single-column inlist currently, multi-column inlist is not supported + let col = inlist + .expr() + .as_any() + .downcast_ref::(); + if col.is_none() { + return builder; + } + + let literals = inlist + .list() + .iter() + .map(|e| e.as_any().downcast_ref::()) + .collect::>>(); + if literals.is_none() { + return builder; + } + + let guarantee = if inlist.negated() { + Guarantee::NotIn + } else { + Guarantee::In + }; + + builder.aggregate_multi_conjunct( + col.unwrap(), + guarantee, + literals.unwrap().iter().map(|e| e.value()), + ) } else { // split disjunction: OR OR ... let disjunctions = split_disjunction(expr); @@ -168,14 +196,21 @@ impl LiteralGuarantee { // if all terms are 'col literal' with the same column // and operation we can infer any guarantees + // + // For those like (a != bar OR a != baz). + // We can't combine the (a != bar OR a != baz) part, but + // it also doesn't invalidate our knowledge that a != + // foo is required for the expression to be true. + // So we can only create a multi guarantee for `=` + // (or a single value). (e.g. ignore `a != foo OR a != bar`) let first_term = &terms[0]; if terms.iter().all(|term| { term.col.name() == first_term.col.name() - && term.op == first_term.op + && term.guarantee == Guarantee::In }) { builder.aggregate_multi_conjunct( first_term.col, - first_term.op, + Guarantee::In, terms.iter().map(|term| term.lit.value()), ) } else { @@ -197,9 +232,9 @@ struct GuaranteeBuilder<'a> { /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None guarantees: Vec>, - /// Key is the (column name, operator type) + /// Key is the (column name, guarantee type) /// Value is the index into `guarantees` - map: HashMap<(&'a crate::expressions::Column, Operator), usize>, + map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>, } impl<'a> GuaranteeBuilder<'a> { @@ -216,7 +251,7 @@ impl<'a> GuaranteeBuilder<'a> { fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self { self.aggregate_multi_conjunct( col_op_lit.col, - col_op_lit.op, + col_op_lit.guarantee, [col_op_lit.lit.value()], ) } @@ -233,10 +268,10 @@ impl<'a> GuaranteeBuilder<'a> { fn aggregate_multi_conjunct( mut self, col: &'a crate::expressions::Column, - op: Operator, + guarantee: Guarantee, new_values: impl IntoIterator, ) -> Self { - let key = (col, op); + let key = (col, guarantee); if let Some(index) = self.map.get(&key) { // already have a guarantee for this column let entry = &mut self.guarantees[*index]; @@ -257,26 +292,20 @@ impl<'a> GuaranteeBuilder<'a> { // another `AND a != 6` we know that a must not be either 5 or 6 // for the expression to be true Guarantee::NotIn => { - // can extend if only single literal, otherwise invalidate let new_values: HashSet<_> = new_values.into_iter().collect(); - if new_values.len() == 1 { - existing.literals.extend(new_values.into_iter().cloned()) - } else { - // this is like (a != foo AND (a != bar OR a != baz)). - // We can't combine the (a != bar OR a != baz) part, but - // it also doesn't invalidate our knowledge that a != - // foo is required for the expression to be true - } + existing.literals.extend(new_values.into_iter().cloned()); } Guarantee::In => { - // for an IN guarantee, it is ok if the value is the same - // e.g. `a = foo AND a = foo` but not if the value is different - // e.g. `a = foo AND a = bar` - if new_values + let intersection = new_values .into_iter() - .all(|new_value| existing.literals.contains(new_value)) - { - // all values are already in the set + .filter(|new_value| existing.literals.contains(*new_value)) + .collect::>(); + // for an In guarantee, if the intersection is not empty, we can extend the guarantee + // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)` + // otherwise, we invalidate the guarantee + // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()` , which is invalid + if !intersection.is_empty() { + existing.literals = intersection.into_iter().cloned().collect(); } else { // at least one was not, so invalidate the guarantee *entry = None; @@ -287,17 +316,12 @@ impl<'a> GuaranteeBuilder<'a> { // This is a new guarantee let new_values: HashSet<_> = new_values.into_iter().collect(); - // new_values are combined with OR, so we can only create a - // multi-column guarantee for `=` (or a single value). - // (e.g. ignore `a != foo OR a != bar`) - if op == Operator::Eq || new_values.len() == 1 { - if let Some(guarantee) = - LiteralGuarantee::try_new(col.name(), op, new_values) - { - // add it to the list of guarantees - self.guarantees.push(Some(guarantee)); - self.map.insert(key, self.guarantees.len() - 1); - } + if let Some(guarantee) = + LiteralGuarantee::try_new(col.name(), guarantee, new_values) + { + // add it to the list of guarantees + self.guarantees.push(Some(guarantee)); + self.map.insert(key, self.guarantees.len() - 1); } } @@ -311,10 +335,10 @@ impl<'a> GuaranteeBuilder<'a> { } } -/// Represents a single `col literal` expression +/// Represents a single `col [not]in literal` expression struct ColOpLit<'a> { col: &'a crate::expressions::Column, - op: Operator, + guarantee: Guarantee, lit: &'a crate::expressions::Literal, } @@ -322,7 +346,7 @@ impl<'a> ColOpLit<'a> { /// Returns Some(ColEqLit) if the expression is either: /// 1. `col literal` /// 2. `literal col` - /// + /// 3. operator is `=` or `!=` /// Returns None otherwise fn try_new(expr: &'a Arc) -> Option { let binary_expr = expr @@ -334,13 +358,21 @@ impl<'a> ColOpLit<'a> { binary_expr.op(), binary_expr.right().as_any(), ); - + let guarantee = match op { + Operator::Eq => Guarantee::In, + Operator::NotEq => Guarantee::NotIn, + _ => return None, + }; // col literal if let (Some(col), Some(lit)) = ( left.downcast_ref::(), right.downcast_ref::(), ) { - Some(Self { col, op: *op, lit }) + Some(Self { + col, + guarantee, + lit, + }) } // literal col else if let (Some(lit), Some(col)) = ( @@ -348,7 +380,11 @@ impl<'a> ColOpLit<'a> { right.downcast_ref::(), ) { // Used swapped operator operator, if possible - op.swap().map(|op| Self { col, op, lit }) + Some(Self { + col, + guarantee, + lit, + }) } else { None } @@ -645,9 +681,19 @@ mod test { ); } - // TODO https://github.com/apache/arrow-datafusion/issues/8436 - // a IN (...) - // b NOT IN (...) + #[test] + fn test_single_inlist() { + // a IN (1, 2, 3) + test_analyze( + col("a").in_list(vec![lit(1), lit(2), lit(3)], true), + vec![in_guarantee("a", [1, 2, 3])], + ); + // a NOT IN (1, 2, 3) + test_analyze( + col("a").in_list(vec![lit(1), lit(2), lit(3)], false), + vec![not_in_guarantee("a", [1, 2, 3])], + ); + } /// Tests that analyzing expr results in the expected guarantees fn test_analyze(expr: Expr, expected: Vec) { @@ -673,7 +719,7 @@ mod test { S: Into + 'a, { let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); - LiteralGuarantee::try_new(column, Operator::Eq, literals.iter()).unwrap() + LiteralGuarantee::try_new(column, Guarantee::In, literals.iter()).unwrap() } /// Guarantee that the expression is true if the column is NOT any of the specified values @@ -683,7 +729,7 @@ mod test { S: Into + 'a, { let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); - LiteralGuarantee::try_new(column, Operator::NotEq, literals.iter()).unwrap() + LiteralGuarantee::try_new(column, Guarantee::NotIn, literals.iter()).unwrap() } /// Convert a logical expression to a physical expression (without any simplification, etc) From 0c403d976689c2653f919e7de4695b0327152cb8 Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Tue, 26 Dec 2023 17:28:51 +0800 Subject: [PATCH 2/7] add more tests --- .../physical-expr/src/utils/guarantee.rs | 100 ++++++++++++++++-- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 937e66e6dfc7..87abfa7898f8 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -683,15 +683,103 @@ mod test { #[test] fn test_single_inlist() { - // a IN (1, 2, 3) + // b IN (1, 2, 3) test_analyze( - col("a").in_list(vec![lit(1), lit(2), lit(3)], true), - vec![in_guarantee("a", [1, 2, 3])], + col("b").in_list(vec![lit(1), lit(2), lit(3)], false), + vec![in_guarantee("b", [1, 2, 3])], + ); + // b NOT IN (1, 2, 3) + test_analyze( + col("b").in_list(vec![lit(1), lit(2), lit(3)], true), + vec![not_in_guarantee("b", [1, 2, 3])], + ); + } + + #[test] + fn test_inlist_conjunction() { + // b IN (1, 2, 3) AND b IN (2, 3, 4) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)), + vec![in_guarantee("b", [2, 3])], + ); + // b NOT IN (1, 2, 3) AND b IN (2, 3, 4) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], true) + .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)), + vec![ + not_in_guarantee("b", [1, 2, 3]), + in_guarantee("b", [2, 3, 4]), + ], + ); + // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], true) + .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)), + vec![not_in_guarantee("b", [1, 2, 3, 4])], + ); + // b IN (1, 2, 3) AND b = 4 + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").eq(lit(4))), + vec![], + ); + // b IN (1, 2, 3) AND b = 2 + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").eq(lit(2))), + vec![in_guarantee("b", [2])], + ); + // b IN (1, 2, 3) AND b != 2 + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").not_eq(lit(2))), + vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])], + ); + // b NOT IN (1, 2, 3) AND b != 4 + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], true) + .and(col("b").not_eq(lit(4))), + vec![not_in_guarantee("b", [1, 2, 3, 4])], ); - // a NOT IN (1, 2, 3) + // b NOT IN (1, 2, 3) AND b != 2 test_analyze( - col("a").in_list(vec![lit(1), lit(2), lit(3)], false), - vec![not_in_guarantee("a", [1, 2, 3])], + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], true) + .and(col("b").not_eq(lit(2))), + vec![not_in_guarantee("b", [1, 2, 3])], + ); + } + + #[test] + fn test_inlist_with_disjunction() { + // b IN (1, 2, 3) AND (b = 3 OR b = 4) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))), + vec![in_guarantee("b", [3])], + ); + // b IN (1, 2, 3) AND (b = 4 OR b = 5) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))), + vec![], + ); + // b not in (1, 2, 3) AND (b = 3 OR b = 4) + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], true) + .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))), + vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])], ); } From d37574eee45abd23b45622b4db0601d668775426 Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Tue, 26 Dec 2023 18:32:04 +0800 Subject: [PATCH 3/7] rm useless notes --- datafusion/physical-expr/src/utils/guarantee.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 87abfa7898f8..7c26141760be 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -106,7 +106,6 @@ impl LiteralGuarantee { }) } - //Note@wy build literal guarantee from expr /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr` /// to evaluate to `true`. /// From c0b839cd635f577ebcc8ccc2b0eeb52dcf316d1e Mon Sep 17 00:00:00 2001 From: yi wang <48236141+my-vegetable-has-exploded@users.noreply.github.com> Date: Wed, 27 Dec 2023 15:36:28 +0800 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Huaijin --- datafusion/physical-expr/src/utils/guarantee.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 7c26141760be..cc7475f47b4a 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -196,11 +196,11 @@ impl LiteralGuarantee { // if all terms are 'col literal' with the same column // and operation we can infer any guarantees // - // For those like (a != bar OR a != baz). + // For those like (a != foo AND (a != bar OR a != baz)). // We can't combine the (a != bar OR a != baz) part, but // it also doesn't invalidate our knowledge that a != // foo is required for the expression to be true. - // So we can only create a multi guarantee for `=` + // So we can only create a multi value guarantee for `=` // (or a single value). (e.g. ignore `a != foo OR a != bar`) let first_term = &terms[0]; if terms.iter().all(|term| { @@ -773,7 +773,7 @@ mod test { .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))), vec![], ); - // b not in (1, 2, 3) AND (b = 3 OR b = 4) + // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4) test_analyze( col("b") .in_list(vec![lit(1), lit(2), lit(3)], true) From e18ebbaf3fba09bb41aa11fdac43bf280f92e87f Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Wed, 27 Dec 2023 16:55:31 +0800 Subject: [PATCH 5/7] add tests in row_groups --- .../physical_plan/parquet/row_groups.rs | 121 ++---------------- 1 file changed, 14 insertions(+), 107 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs index 8a1abb7d965f..5d18eac7d9fb 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs @@ -293,15 +293,10 @@ mod tests { use arrow::datatypes::DataType::Decimal128; use arrow::datatypes::Schema; use arrow::datatypes::{DataType, Field}; - use datafusion_common::{config::ConfigOptions, TableReference, ToDFSchema}; - use datafusion_common::{DataFusionError, Result}; - use datafusion_expr::{ - builder::LogicalTableSource, cast, col, lit, AggregateUDF, Expr, ScalarUDF, - TableSource, WindowUDF, - }; + use datafusion_common::{Result, ToDFSchema}; + use datafusion_expr::{cast, col, lit, Expr}; use datafusion_physical_expr::execution_props::ExecutionProps; use datafusion_physical_expr::{create_physical_expr, PhysicalExpr}; - use datafusion_sql::planner::ContextProvider; use parquet::arrow::arrow_to_parquet_schema; use parquet::arrow::async_reader::ParquetObjectReader; use parquet::basic::LogicalType; @@ -1105,13 +1100,18 @@ mod tests { let data = bytes::Bytes::from(std::fs::read(path).unwrap()); // generate pruning predicate - let schema = Schema::new(vec![ - Field::new("String", DataType::Utf8, false), - Field::new("String3", DataType::Utf8, false), - ]); - let sql = - "SELECT * FROM tbl WHERE \"String\" IN ('Hello_Not_Exists', 'Hello_Not_Exists2')"; - let expr = sql_to_physical_plan(sql).unwrap(); + let schema = Schema::new(vec![Field::new("String", DataType::Utf8, false)]); + + let expr = col(r#""String""#).in_list( + vec![ + lit("Hello_Not_Exists"), + lit("Hello_Not_Exists2"), + lit("Hello_Not_Exists3"), + lit("Hello_Not_Exist4"), + ], + false, + ); + let expr = logical2physical(&expr, &schema); let pruning_predicate = PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); @@ -1312,97 +1312,4 @@ mod tests { Ok(pruned_row_group) } - - fn sql_to_physical_plan(sql: &str) -> Result> { - use datafusion_optimizer::{ - analyzer::Analyzer, optimizer::Optimizer, OptimizerConfig, OptimizerContext, - }; - use datafusion_sql::{ - planner::SqlToRel, - sqlparser::{ast::Statement, parser::Parser}, - }; - use sqlparser::dialect::GenericDialect; - - // parse the SQL - let dialect = GenericDialect {}; // or AnsiDialect, or your own dialect ... - let ast: Vec = Parser::parse_sql(&dialect, sql).unwrap(); - let statement = &ast[0]; - - // create a logical query plan - let schema_provider = TestSchemaProvider::new(); - let sql_to_rel = SqlToRel::new(&schema_provider); - let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); - - // hard code the return value of now() - let config = OptimizerContext::new().with_skip_failing_rules(false); - let analyzer = Analyzer::new(); - let optimizer = Optimizer::new(); - // analyze and optimize the logical plan - let plan = analyzer.execute_and_check(&plan, config.options(), |_, _| {})?; - let plan = optimizer.optimize(&plan, &config, |_, _| {})?; - // convert the logical plan into a physical plan - let exprs = plan.expressions(); - let expr = &exprs[0]; - let df_schema = plan.schema().as_ref().to_owned(); - let tb_schema: Schema = df_schema.clone().into(); - let execution_props = ExecutionProps::new(); - create_physical_expr(expr, &df_schema, &tb_schema, &execution_props) - } - - struct TestSchemaProvider { - options: ConfigOptions, - tables: HashMap>, - } - - impl TestSchemaProvider { - pub fn new() -> Self { - let mut tables = HashMap::new(); - tables.insert( - "tbl".to_string(), - create_table_source(vec![Field::new( - "String".to_string(), - DataType::Utf8, - false, - )]), - ); - - Self { - options: Default::default(), - tables, - } - } - } - - impl ContextProvider for TestSchemaProvider { - fn get_table_source(&self, name: TableReference) -> Result> { - match self.tables.get(name.table()) { - Some(table) => Ok(table.clone()), - _ => datafusion_common::plan_err!("Table not found: {}", name.table()), - } - } - - fn get_function_meta(&self, _name: &str) -> Option> { - None - } - - fn get_aggregate_meta(&self, _name: &str) -> Option> { - None - } - - fn get_variable_type(&self, _variable_names: &[String]) -> Option { - None - } - - fn options(&self) -> &ConfigOptions { - &self.options - } - - fn get_window_meta(&self, _name: &str) -> Option> { - None - } - } - - fn create_table_source(fields: Vec) -> Arc { - Arc::new(LogicalTableSource::new(Arc::new(Schema::new(fields)))) - } } From 0e14604c532fa42d5a2ab82ecf1363f317a30ba4 Mon Sep 17 00:00:00 2001 From: yi wang <48236141+my-vegetable-has-exploded@users.noreply.github.com> Date: Thu, 28 Dec 2023 13:12:44 +0800 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Ruihang Xia Co-authored-by: Andrew Lamb --- datafusion/physical-expr/src/utils/guarantee.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index cc7475f47b4a..d79673a3ba20 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -128,12 +128,12 @@ impl LiteralGuarantee { .as_any() .downcast_ref::() { - //Only support single-column inlist currently, multi-column inlist is not supported + // Only support single-column inlist currently, multi-column inlist is not supported let col = inlist .expr() .as_any() .downcast_ref::(); - if col.is_none() { + let Some(col) = col else { return builder; } @@ -302,7 +302,7 @@ impl<'a> GuaranteeBuilder<'a> { // for an In guarantee, if the intersection is not empty, we can extend the guarantee // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)` // otherwise, we invalidate the guarantee - // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()` , which is invalid + // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid if !intersection.is_empty() { existing.literals = intersection.into_iter().cloned().collect(); } else { From d5a18c27ecd41e580d8fb021a824bfa49f587f6b Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Thu, 28 Dec 2023 13:31:25 +0800 Subject: [PATCH 7/7] update comment & add more tests --- .../physical-expr/src/utils/guarantee.rs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index d79673a3ba20..0aee2af67fdd 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -135,16 +135,16 @@ impl LiteralGuarantee { .downcast_ref::(); let Some(col) = col else { return builder; - } + }; let literals = inlist .list() .iter() .map(|e| e.as_any().downcast_ref::()) .collect::>>(); - if literals.is_none() { + let Some(literals) = literals else { return builder; - } + }; let guarantee = if inlist.negated() { Guarantee::NotIn @@ -153,9 +153,9 @@ impl LiteralGuarantee { }; builder.aggregate_multi_conjunct( - col.unwrap(), + col, guarantee, - literals.unwrap().iter().map(|e| e.value()), + literals.iter().map(|e| e.value()), ) } else { // split disjunction: OR OR ... @@ -378,7 +378,6 @@ impl<'a> ColOpLit<'a> { left.downcast_ref::(), right.downcast_ref::(), ) { - // Used swapped operator operator, if possible Some(Self { col, guarantee, @@ -780,6 +779,21 @@ mod test { .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))), vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])], ); + // b IN (1, 2, 3) OR b = 2 + // TODO this should be in_guarantee("b", [1, 2, 3]) but currently we don't support to anylize this kind of disjunction. Only `ColOpLit OR ColOpLit` is supported. + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .or(col("b").eq(lit(2))), + vec![], + ); + // b IN (1, 2, 3) OR b != 3 + test_analyze( + col("b") + .in_list(vec![lit(1), lit(2), lit(3)], false) + .or(col("b").not_eq(lit(3))), + vec![], + ); } /// Tests that analyzing expr results in the expected guarantees