diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md index b81a5e43d57bf..d7827f88801d4 100644 --- a/docs/sql-ref-syntax-qry-select-groupby.md +++ b/docs/sql-ref-syntax-qry-select-groupby.md @@ -24,8 +24,8 @@ license: | The `GROUP BY` clause is used to group the rows based on a set of specified grouping expressions and compute aggregations on the group of rows based on one or more specified aggregate functions. Spark also supports advanced aggregations to do multiple aggregations for the same input record set via `GROUPING SETS`, `CUBE`, `ROLLUP` clauses. -The grouping expressions and advanced aggregations can be mixed in the `GROUP BY` clause. -See more details in the `Mixed Grouping Analytics` section. When a FILTER clause is attached to +The grouping expressions and advanced aggregations can be mixed in the `GROUP BY` clause and nested in a `GROUPING SETS` clause. +See more details in the `Mixed/Nested Grouping Analytics` section. When a FILTER clause is attached to an aggregate function, only the matching rows are passed to that function. ### Syntax @@ -95,13 +95,17 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. -* **Mixed Grouping Analytics** +* **Mixed/Nested Grouping Analytics** - A GROUP BY clause can include multiple `group_expression`s and multiple `CUBE|ROLLUP|GROUPING SETS`s. + A GROUP BY clause can include multiple `group_expression`s and multiple `CUBE|ROLLUP|GROUPING SETS`s. + `GROUPING SETS` can also have nested `CUBE|ROLLUP|GROUPING SETS` clauses, e.g. + `GROUPING SETS(ROLLUP(warehouse, location), CUBE(warehouse, location))`, + `GROUPING SETS(warehouse, GROUPING SETS(location, GROUPING SETS(ROLLUP(warehouse, location), CUBE(warehouse, location))))`. 
`CUBE|ROLLUP` is just a syntax sugar for `GROUPING SETS`, please refer to the sections above for how to translate `CUBE|ROLLUP` to `GROUPING SETS`. `group_expression` can be treated as a single-group `GROUPING SETS` under this context. For multiple `GROUPING SETS` in the `GROUP BY` clause, we generate - a single `GROUPING SETS` by doing a cross-product of the original `GROUPING SETS`s. For example, + a single `GROUPING SETS` by doing a cross-product of the original `GROUPING SETS`s. For nested `GROUPING SETS` in the `GROUPING SETS` clause, + we simply take the grouping sets of the nested clause and remove the nesting. For example, `GROUP BY warehouse, GROUPING SETS((product), ()), GROUPING SETS((location, size), (location), (size), ())` and `GROUP BY warehouse, ROLLUP(product), CUBE(location, size)` is equivalent to `GROUP BY GROUPING SETS( @@ -113,7 +117,10 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex (warehouse, location), (warehouse, size), (warehouse))`. - + + `GROUP BY GROUPING SETS(GROUPING SETS(warehouse), GROUPING SETS((warehouse, product)))` is equivalent to + `GROUP BY GROUPING SETS((warehouse), (warehouse, product))`. + * **aggregate_name** Specifies an aggregate function name (MIN, MAX, COUNT, SUM, AVG, etc.). 
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index c958f9c387767..55666870548be 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -606,7 +606,13 @@ groupByClause ; groupingAnalytics - : (ROLLUP | CUBE | GROUPING SETS) '(' groupingSet (',' groupingSet)* ')' + : (ROLLUP | CUBE) '(' groupingSet (',' groupingSet)* ')' + | GROUPING SETS '(' groupingElement (',' groupingElement)* ')' + ; + +groupingElement + : groupingAnalytics + | groupingSet ; groupingSet diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index f9317e865f275..eb61fa0c94e4a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -994,26 +994,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg .map(groupByExpr => { val groupingAnalytics = groupByExpr.groupingAnalytics if (groupingAnalytics != null) { - val groupingSets = groupingAnalytics.groupingSet.asScala - .map(_.expression.asScala.map(e => expression(e)).toSeq) - if (groupingAnalytics.CUBE != null) { - // CUBE(A, B, (A, B), ()) is not supported. - if (groupingSets.exists(_.isEmpty)) { - throw new ParseException("Empty set in CUBE grouping sets is not supported.", - groupingAnalytics) - } - Cube(groupingSets.toSeq) - } else if (groupingAnalytics.ROLLUP != null) { - // ROLLUP(A, B, (A, B), ()) is not supported. 
- if (groupingSets.exists(_.isEmpty)) { - throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", - groupingAnalytics) - } - Rollup(groupingSets.toSeq) - } else { - assert(groupingAnalytics.GROUPING != null && groupingAnalytics.SETS != null) - GroupingSets(groupingSets.toSeq) - } + visitGroupingAnalytics(groupingAnalytics) } else { expression(groupByExpr.expression) } @@ -1022,6 +1003,38 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } } + override def visitGroupingAnalytics( + groupingAnalytics: GroupingAnalyticsContext): BaseGroupingSets = { + val groupingSets = groupingAnalytics.groupingSet.asScala + .map(_.expression.asScala.map(e => expression(e)).toSeq) + if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(groupingSets.toSeq) + } else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(groupingSets.toSeq) + } else { + assert(groupingAnalytics.GROUPING != null && groupingAnalytics.SETS != null) + val groupingSets = groupingAnalytics.groupingElement.asScala.flatMap { expr => + val groupingAnalytics = expr.groupingAnalytics() + if (groupingAnalytics != null) { + visitGroupingAnalytics(groupingAnalytics).selectedGroupByExprs + } else { + Seq(expr.groupingSet().expression().asScala.map(e => expression(e)).toSeq) + } + } + GroupingSets(groupingSets.toSeq) + } + } + /** * Add [[UnresolvedHint]]s to a logical plan. 
*/ diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql index 6dfe31e2706e4..d6381e59e0d84 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql @@ -80,3 +80,14 @@ SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), (a), ()); SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), GROUPING SETS((a, b), (a), ()); SELECT a, b, count(1) FROM testData GROUP BY a, CUBE(a, b), ROLLUP(a, b), GROUPING SETS((a, b), (a), ()); +-- Support nested CUBE/ROLLUP/GROUPING SETS in GROUPING SETS +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b)); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ())); + +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), GROUPING SETS(ROLLUP(a, b))); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b, a, b), (a, b, a), (a, b)); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b, a, b), (a, b, a), (a, b))); + +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b), CUBE(a, b)); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ()), GROUPING SETS((a, b), (a), (b), ())); +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), (a), (), (a, b), (a), (b), ()); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/group-analytics.sql.out index 1db8febb81f9e..9dbfc4cf4fa18 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/group-analytics.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 44 +-- Number of queries: 52 -- !query 
@@ -1067,3 +1067,227 @@ struct 3 NULL 2 3 NULL 2 3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 2 1 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ())) +-- !query schema +struct +-- !query output +1 1 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 2 1 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), GROUPING SETS(ROLLUP(a, b))) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b, a, b), (a, b, a), (a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b, a, b), (a, b, a), (a, b))) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b), CUBE(a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ()), GROUPING SETS((a, b), (a), (b), ())) +-- !query schema +struct 
+-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), (a), (), (a, b), (a), (b), ()) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2 diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out index 6dc02ead9daab..f249908163d01 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 44 +-- Number of queries: 52 -- !query @@ -1087,3 +1087,227 @@ struct 3 NULL 2 3 NULL 2 3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 2 1 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ())) +-- !query schema +struct +-- !query output +1 1 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 2 1 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), GROUPING SETS(ROLLUP(a, b))) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 
2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b, a, b), (a, b, a), (a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b, a, b), (a, b, a), (a, b))) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(ROLLUP(a, b), CUBE(a, b)) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS(GROUPING SETS((a, b), (a), ()), GROUPING SETS((a, b), (a), (b), ())) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2 + + +-- !query +SELECT a, b, count(1) FROM testData GROUP BY a, GROUPING SETS((a, b), (a), (), (a, b), (a), (b), ()) +-- !query schema +struct +-- !query output +1 1 1 +1 1 1 +1 1 1 +1 2 1 +1 2 1 +1 2 1 +1 NULL 2 +1 NULL 2 +1 NULL 2 +1 NULL 2 +2 1 1 +2 1 1 +2 1 1 +2 2 1 +2 2 1 +2 2 1 +2 NULL 2 +2 NULL 2 +2 NULL 2 +2 NULL 2 +3 1 1 +3 1 1 +3 1 1 +3 2 1 +3 2 1 +3 2 1 +3 NULL 2 +3 NULL 2 +3 NULL 2 +3 NULL 2