From 93ebc8ff4d1cb114eb273ad137fde707f9a185aa Mon Sep 17 00:00:00 2001 From: hhj Date: Thu, 30 Nov 2023 16:33:18 +0800 Subject: [PATCH 1/4] fix: make ntile work in some corner cases --- datafusion/expr/src/window_function.rs | 15 +- datafusion/physical-expr/src/window/ntile.rs | 6 +- datafusion/physical-plan/src/windows/mod.rs | 29 +++- datafusion/sqllogictest/test_files/window.slt | 162 ++++++++++++++++++ 4 files changed, 201 insertions(+), 11 deletions(-) diff --git a/datafusion/expr/src/window_function.rs b/datafusion/expr/src/window_function.rs index 946a80dd844a..610f1ecaeae9 100644 --- a/datafusion/expr/src/window_function.rs +++ b/datafusion/expr/src/window_function.rs @@ -268,7 +268,20 @@ impl BuiltInWindowFunction { BuiltInWindowFunction::FirstValue | BuiltInWindowFunction::LastValue => { Signature::any(1, Volatility::Immutable) } - BuiltInWindowFunction::Ntile => Signature::any(1, Volatility::Immutable), + BuiltInWindowFunction::Ntile => Signature::uniform( + 1, + vec![ + DataType::UInt64, + DataType::UInt32, + DataType::UInt16, + DataType::UInt8, + DataType::Int64, + DataType::Int32, + DataType::Int16, + DataType::Int8, + ], + Volatility::Immutable, + ), BuiltInWindowFunction::NthValue => Signature::any(2, Volatility::Immutable), } } diff --git a/datafusion/physical-expr/src/window/ntile.rs b/datafusion/physical-expr/src/window/ntile.rs index 49aac0877ab3..0998fc77a2ea 100644 --- a/datafusion/physical-expr/src/window/ntile.rs +++ b/datafusion/physical-expr/src/window/ntile.rs @@ -96,8 +96,12 @@ impl PartitionEvaluator for NtileEvaluator { ) -> Result { let num_rows = num_rows as u64; let mut vec: Vec = Vec::new(); + let mut n = self.n; + if n > num_rows { + n = num_rows; + } for i in 0..num_rows { - let res = i * self.n / num_rows; + let res = i * n / num_rows; vec.push(res + 1) } Ok(Arc::new(UInt64Array::from(vec))) diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index d97e3c93a136..11af764839a8 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -189,15 +189,26 @@ fn create_built_in_window_expr( BuiltInWindowFunction::PercentRank => Arc::new(percent_rank(name)), BuiltInWindowFunction::CumeDist => Arc::new(cume_dist(name)), BuiltInWindowFunction::Ntile => { - let n: i64 = get_scalar_value_from_args(args, 0)? - .ok_or_else(|| { - DataFusionError::Execution( - "NTILE requires at least 1 argument".to_string(), - ) - })? - .try_into()?; - let n: u64 = n as u64; - Arc::new(Ntile::new(name, n)) + let n = get_scalar_value_from_args(args, 0)?.ok_or_else(|| { + DataFusionError::Execution( + "NTILE requires at least 1 argument".to_string(), + ) + })?; + + if n.is_null() { + return exec_err!("NTILE requires a positive integer, but finds NULL"); + } + + if n.is_unsigned() { + let n: u64 = n.try_into()?; + Arc::new(Ntile::new(name, n)) + } else { + let n: i64 = n.try_into()?; + if n <= 0 { + return exec_err!("NTILE requires a positive integer"); + } + Arc::new(Ntile::new(name, n as u64)) + } } BuiltInWindowFunction::Lag => { let arg = args[0].clone(); diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index b2491478d84e..b8f5f76f57ad 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3581,3 +3581,165 @@ CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggr statement ok DROP TABLE new_table; + +statement ok +CREATE TABLE t1 (a int) AS VALUES (1), (2), (3); + +query I +SELECT NTILE(9223377) OVER(ORDER BY a) FROM t1; +---- +1 +2 +3 + +query I +SELECT NTILE(9223372036854775809) OVER(ORDER BY a) FROM t1; +---- +1 +2 +3 + +query error DataFusion error: Execution error: NTILE requires a positive integer +SELECT NTILE(-922337203685477580) OVER(ORDER BY a) FROM t1; + +query error DataFusion error: Execution error: Table 't' doesn't exist\. +DROP TABLE t; + +# NTILE with PARTITION BY, those tests from duckdb: https://github.com/duckdb/duckdb/blob/main/test/sql/window/test_ntile.test +statement ok +CREATE TABLE Scoreboard(TeamName VARCHAR, Player VARCHAR, Score INTEGER); + +statement ok +INSERT INTO Scoreboard VALUES ('Mongrels', 'Apu', 350); + +statement ok +INSERT INTO Scoreboard VALUES ('Mongrels', 'Ned', 666); + +statement ok +INSERT INTO Scoreboard VALUES ('Mongrels', 'Meg', 1030); + +statement ok +INSERT INTO Scoreboard VALUES ('Mongrels', 'Burns', 1270); + +statement ok +INSERT INTO Scoreboard VALUES ('Simpsons', 'Homer', 1); + +statement ok +INSERT INTO Scoreboard VALUES ('Simpsons', 'Lisa', 710); + +statement ok +INSERT INTO Scoreboard VALUES ('Simpsons', 'Marge', 990); + +statement ok +INSERT INTO Scoreboard VALUES ('Simpsons', 'Bart', 2010); + +query TTII +SELECT + TeamName, + Player, + Score, + NTILE(2) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s +ORDER BY TeamName, Score; +---- +Mongrels Apu 350 1 +Mongrels Ned 666 1 +Mongrels Meg 1030 2 +Mongrels Burns 1270 2 +Simpsons Homer 1 1 +Simpsons Lisa 710 1 +Simpsons Marge 990 2 +Simpsons Bart 2010 2 + +query TTII +SELECT + TeamName, + Player, + Score, + NTILE(2) OVER (ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s +ORDER BY Score; +---- +Simpsons Homer 1 1 +Mongrels Apu 350 1 +Mongrels Ned 666 1 +Simpsons Lisa 710 1 +Simpsons Marge 990 2 +Mongrels Meg 1030 2 +Mongrels Burns 1270 2 +Simpsons Bart 2010 2 + +query TTII +SELECT + TeamName, + Player, + Score, + NTILE(1000) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s +ORDER BY TeamName, Score; +---- +Mongrels Apu 350 1 +Mongrels Ned 666 2 +Mongrels Meg 1030 3 +Mongrels Burns 1270 4 +Simpsons Homer 1 1 +Simpsons Lisa 710 2 +Simpsons Marge 990 3 +Simpsons Bart 2010 4 + +query TTII +SELECT + TeamName, + Player, + Score, + NTILE(1) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s +ORDER BY TeamName, Score; +---- +Mongrels Apu 350 1 +Mongrels Ned 666 1 +Mongrels Meg 1030 1 +Mongrels Burns 1270 1 +Simpsons Homer 1 1 +Simpsons Lisa 710 1 +Simpsons Marge 990 1 +Simpsons Bart 2010 1 + +# incorrect number of parameters for ntile +query error DataFusion error: Execution error: NTILE requires a positive integer, but finds NULL +SELECT + NTILE(NULL) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +query error DataFusion error: Execution error: NTILE requires a positive integer +SELECT + NTILE(-1) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +query error DataFusion error: Execution error: NTILE requires a positive integer +SELECT + NTILE(0) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +statement error +SELECT + NTILE() OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +statement error +SELECT + NTILE(1,2) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +statement error +SELECT + NTILE(1,2,3) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +statement error +SELECT + NTILE(1,2,3,4) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE +FROM ScoreBoard s + +statement ok +DROP TABLE Scoreboard; From 6f8cbdbe6bc4719b1376ed49d23671082ead6188 Mon Sep 17 00:00:00 2001 From: hhj Date: Fri, 1 Dec 2023 10:13:28 +0800 Subject: [PATCH 2/4] fix comments --- datafusion/physical-expr/src/window/ntile.rs | 5 +- datafusion/sqllogictest/test_files/window.slt | 96 +++++++++---------- 2 files changed, 49 insertions(+), 52 deletions(-) diff --git a/datafusion/physical-expr/src/window/ntile.rs b/datafusion/physical-expr/src/window/ntile.rs index 0998fc77a2ea..f5442e1b0fee 100644 --- a/datafusion/physical-expr/src/window/ntile.rs +++ b/datafusion/physical-expr/src/window/ntile.rs @@ -96,10 +96,7 @@ impl PartitionEvaluator for NtileEvaluator { ) -> Result { let num_rows = num_rows as u64; let mut vec: Vec = Vec::new(); - let mut n = self.n; - if n > num_rows { - n = num_rows; - } + let n = u64::min(self.n, num_rows); for i in 0..num_rows { let res = i * n / num_rows; vec.push(res + 1) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index b8f5f76f57ad..89109569b6b9 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3607,40 +3607,40 @@ DROP TABLE t; # NTILE with PARTITION BY, those tests from duckdb: https://github.com/duckdb/duckdb/blob/main/test/sql/window/test_ntile.test statement ok -CREATE TABLE Scoreboard(TeamName VARCHAR, Player VARCHAR, Score INTEGER); +CREATE TABLE score_board(team_name VARCHAR, player VARCHAR, score INTEGER); statement ok -INSERT INTO Scoreboard VALUES ('Mongrels', 'Apu', 350); +INSERT INTO score_board VALUES ('Mongrels', 'Apu', 350); statement ok -INSERT INTO Scoreboard VALUES ('Mongrels', 'Ned', 666); +INSERT INTO score_board VALUES ('Mongrels', 'Ned', 666); statement ok -INSERT INTO Scoreboard VALUES ('Mongrels', 'Meg', 1030); +INSERT INTO score_board VALUES ('Mongrels', 'Meg', 1030); statement ok -INSERT INTO Scoreboard VALUES ('Mongrels', 'Burns', 1270); +INSERT INTO score_board VALUES ('Mongrels', 'Burns', 1270); statement ok -INSERT INTO Scoreboard VALUES ('Simpsons', 'Homer', 1); +INSERT INTO score_board VALUES ('Simpsons', 'Homer', 1); statement ok -INSERT INTO Scoreboard VALUES ('Simpsons', 'Lisa', 710); +INSERT INTO score_board VALUES ('Simpsons', 'Lisa', 710); statement ok -INSERT INTO Scoreboard VALUES ('Simpsons', 'Marge', 990); +INSERT INTO score_board VALUES ('Simpsons', 'Marge', 990); statement ok -INSERT INTO Scoreboard VALUES ('Simpsons', 'Bart', 2010); +INSERT INTO score_board VALUES ('Simpsons', 'Bart', 2010); query TTII SELECT - TeamName, - Player, - Score, - NTILE(2) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s -ORDER BY TeamName, Score; + team_name, + player, + score, + NTILE(2) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s +ORDER BY team_name, score; ---- Mongrels Apu 350 1 Mongrels Ned 666 1 @@ -3653,12 +3653,12 @@ Simpsons Bart 2010 2 query TTII SELECT - TeamName, - Player, - Score, - NTILE(2) OVER (ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s -ORDER BY Score; + team_name, + player, + score, + NTILE(2) OVER (ORDER BY score ASC) AS NTILE +FROM score_board s +ORDER BY score; ---- Simpsons Homer 1 1 Mongrels Apu 350 1 @@ -3671,12 +3671,12 @@ Simpsons Bart 2010 2 query TTII SELECT - TeamName, - Player, - Score, - NTILE(1000) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s -ORDER BY TeamName, Score; + team_name, + player, + score, + NTILE(1000) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s +ORDER BY team_name, score; ---- Mongrels Apu 350 1 Mongrels Ned 666 2 @@ -3689,12 +3689,12 @@ Simpsons Bart 2010 4 query TTII SELECT - TeamName, - Player, - Score, - NTILE(1) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s -ORDER BY TeamName, Score; + team_name, + player, + score, + NTILE(1) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s +ORDER BY team_name, score; ---- Mongrels Apu 350 1 Mongrels Ned 666 1 @@ -3708,38 +3708,38 @@ Simpsons Bart 2010 1 # incorrect number of parameters for ntile query error DataFusion error: Execution error: NTILE requires a positive integer, but finds NULL SELECT - NTILE(NULL) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(NULL) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s query error DataFusion error: Execution error: NTILE requires a positive integer SELECT - NTILE(-1) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(-1) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s query error DataFusion error: Execution error: NTILE requires a positive integer SELECT - NTILE(0) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(0) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s statement error SELECT - NTILE() OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE() OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s statement error SELECT - NTILE(1,2) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(1,2) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s statement error SELECT - NTILE(1,2,3) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(1,2,3) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s statement error SELECT - NTILE(1,2,3,4) OVER (PARTITION BY TeamName ORDER BY Score ASC) AS NTILE -FROM ScoreBoard s + NTILE(1,2,3,4) OVER (PARTITION BY team_name ORDER BY score ASC) AS NTILE +FROM score_board s statement ok -DROP TABLE Scoreboard; +DROP TABLE score_board; From 76b3cbf937d49c21e8943101adb44e969feb57e6 Mon Sep 17 00:00:00 2001 From: hhj Date: Fri, 1 Dec 2023 10:50:45 +0800 Subject: [PATCH 3/4] minor --- datafusion/physical-plan/src/windows/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 11af764839a8..828dcb4b130c 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -191,7 +191,7 @@ fn create_built_in_window_expr( BuiltInWindowFunction::Ntile => { let n = get_scalar_value_from_args(args, 0)?.ok_or_else(|| { DataFusionError::Execution( - "NTILE requires at least 1 argument".to_string(), + "NTILE requires a positive integer".to_string(), ) })?; From dad0270bbada9353a43587af50fbf7543b086f37 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 1 Dec 2023 14:51:17 +0800 Subject: [PATCH 4/4] Update datafusion/sqllogictest/test_files/window.slt Co-authored-by: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> --- datafusion/sqllogictest/test_files/window.slt | 34 +++++-------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 89109569b6b9..bb6ca119480d 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3607,31 +3607,15 @@ DROP TABLE t; # NTILE with PARTITION BY, those tests from duckdb: https://github.com/duckdb/duckdb/blob/main/test/sql/window/test_ntile.test statement ok -CREATE TABLE score_board(team_name VARCHAR, player VARCHAR, score INTEGER); - -statement ok -INSERT INTO score_board VALUES ('Mongrels', 'Apu', 350); - -statement ok -INSERT INTO score_board VALUES ('Mongrels', 'Ned', 666); - -statement ok -INSERT INTO score_board VALUES ('Mongrels', 'Meg', 1030); - -statement ok -INSERT INTO score_board VALUES ('Mongrels', 'Burns', 1270); - -statement ok -INSERT INTO score_board VALUES ('Simpsons', 'Homer', 1); - -statement ok -INSERT INTO score_board VALUES ('Simpsons', 'Lisa', 710); - -statement ok -INSERT INTO score_board VALUES ('Simpsons', 'Marge', 990); - -statement ok -INSERT INTO score_board VALUES ('Simpsons', 'Bart', 2010); +CREATE TABLE score_board (team_name VARCHAR, player VARCHAR, score INTEGER) as VALUES + ('Mongrels', 'Apu', 350), + ('Mongrels', 'Ned', 666), + ('Mongrels', 'Meg', 1030), + ('Mongrels', 'Burns', 1270), + ('Simpsons', 'Homer', 1), + ('Simpsons', 'Lisa', 710), + ('Simpsons', 'Marge', 990), + ('Simpsons', 'Bart', 2010) query TTII SELECT