Skip to content

Commit

Permalink
Add support for MaxBy and MinBy in the Qualification tool (#1335)
Browse files Browse the repository at this point in the history
Signed-off-by: Ahmed Hussein <[email protected]>

Fixes #1329

- Add `MaxBy` and `MinBy` to the supported expressions CSV files
- Update the score sheets
- Remove the override from the `override_supported_configs.json`
- Add a unit test to verify that `min_by` and `max_by` are marked as
  supported when appearing in hashAggregate
  • Loading branch information
amahussein authored Sep 6, 2024
1 parent 4f2d6e0 commit 12adc3f
Show file tree
Hide file tree
Showing 15 changed files with 64 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-databricks-aws-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-gke-l4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-gke-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-l4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-a10.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-a10G.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-onprem-a100.csv
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
12 changes: 12 additions & 0 deletions core/src/main/resources/supportedExprs.csv
Original file line number Diff line number Diff line change
Expand Up @@ -713,12 +713,24 @@ Max,S,`max`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,N
Max,S,`max`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Max,S,`max`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
Max,S,`max`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
MaxBy,S,`max_by`,None,aggregation,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,aggregation,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MaxBy,S,`max_by`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,reduction,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,reduction,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MaxBy,S,`max_by`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
Min,S,`min`,None,aggregation,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
Min,S,`min`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
MinBy,S,`min_by`,None,aggregation,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,aggregation,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MinBy,S,`min_by`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,reduction,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,reduction,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MinBy,S,`min_by`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
Percentile,S,`percentile`,None,aggregation,input,NA,S,S,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
Percentile,S,`percentile`,None,aggregation,percentage,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA
Percentile,S,`percentile`,None,aggregation,frequency,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.rapids.tool.ToolUtils
import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo
import org.apache.spark.sql.rapids.tool.util.{RapidsToolsConfUtil, ToolsPlanGraph, UTF8Source}
import org.apache.spark.sql.rapids.tool.util.{FSUtils, RapidsToolsConfUtil, ToolsPlanGraph, UTF8Source}

class SQLPlanParserSuite extends BaseTestSuite {

Expand Down Expand Up @@ -1832,4 +1832,31 @@ class SQLPlanParserSuite extends BaseTestSuite {
"CheckOverflowInTableInsert should not exist in the physical plan.")
}
}

test("MinBy and MaxBy are supported") {
// Test that aggregates minBy and MAxBy are marked as supported by the qualification tool.
TrampolineUtil.withTempDir { eventLogDir =>
val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "minMaxBy") { spark =>
import spark.implicits._
val testData = Seq((1, 2, 3), (1, 4, 6), (2, 3, 4)).toDF("a", "b", "c")
testData.createOrReplaceTempView("t1")
spark.sql("SELECT a, min_by(b, c), max_by(b, c) FROM t1 GROUP BY a")
}
// validate that the eventlog contains MinBy and MaxBy
val reader = FSUtils.readFileContentAsUTF8(eventLog)
assert(reader.contains("min_by"))
assert(reader.contains("max_by"))
// run the qualification tool
val pluginTypeChecker = new PluginTypeChecker()
val app = createAppFromEventlog(eventLog)
val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
}
// we should have 2 hash aggregates with min_by and max_by expressions
// if the min_by and max_by were not recognized, the test would fail
val hashAggExecs =
getAllExecsFromPlan(parsedPlans.toSeq).filter(_.exec.equals("HashAggregate"))
assertSizeAndSupported(2, hashAggExecs)
}
}
}
132 changes: 0 additions & 132 deletions scripts/sync_plugin_files/override_supported_configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -274,138 +274,6 @@
"value": "`hive-hash`"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
}
]
}

0 comments on commit 12adc3f

Please sign in to comment.