Skip to content

Commit

Permalink
Add support for MaxBy and MinBy in the Qualification tool (#1335)
Browse files Browse the repository at this point in the history
Signed-off-by: Ahmed Hussein <[email protected]>

Fixes #1329

- Add `MaxBy` and `MinBy` to the supported expressions CSV files
- Update the score sheets
- Remove the override from the `override_supported_configs.json`
- Add a unit test to verify that `min_by` and `max_by` are marked as
  supported when appearing in hashAggregate
  • Loading branch information
amahussein authored Sep 6, 2024
1 parent 4f2d6e0 commit 12adc3f
Show file tree
Hide file tree
Showing 15 changed files with 64 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-databricks-aws-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-gke-l4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-gke-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-l4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-dataproc-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-a10.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-a10G.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-emr-t4.csv
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
2 changes: 2 additions & 0 deletions core/src/main/resources/operatorsScore-onprem-a100.csv
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,5 @@ BoundReference,1.5
HiveHash,1.5
MapFromArrays,1.5
DecimalSum,1.5
MaxBy,1.5
MinBy,1.5
12 changes: 12 additions & 0 deletions core/src/main/resources/supportedExprs.csv
Original file line number Diff line number Diff line change
Expand Up @@ -713,12 +713,24 @@ Max,S,`max`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,N
Max,S,`max`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Max,S,`max`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
Max,S,`max`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
MaxBy,S,`max_by`,None,aggregation,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,aggregation,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MaxBy,S,`max_by`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,reduction,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MaxBy,S,`max_by`,None,reduction,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MaxBy,S,`max_by`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
Min,S,`min`,None,aggregation,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,reduction,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
Min,S,`min`,None,window,input,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
Min,S,`min`,None,window,result,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NA,NS,NS,NA,NA
MinBy,S,`min_by`,None,aggregation,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,aggregation,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MinBy,S,`min_by`,None,aggregation,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,reduction,value,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
MinBy,S,`min_by`,None,reduction,ordering,S,S,S,S,S,NS,NS,S,PS,S,S,S,NS,NS,PS,NA,PS,NS,NA,NA
MinBy,S,`min_by`,None,reduction,result,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS
Percentile,S,`percentile`,None,aggregation,input,NA,S,S,S,S,S,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
Percentile,S,`percentile`,None,aggregation,percentage,NA,NA,NA,NA,NA,NA,PS,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA
Percentile,S,`percentile`,None,aggregation,frequency,NA,NA,NA,NA,S,NA,NA,NA,NA,NA,NA,NA,NA,NA,S,NA,NA,NA,NA,NA
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.rapids.tool.ToolUtils
import org.apache.spark.sql.rapids.tool.qualification.QualificationAppInfo
import org.apache.spark.sql.rapids.tool.util.{RapidsToolsConfUtil, ToolsPlanGraph, UTF8Source}
import org.apache.spark.sql.rapids.tool.util.{FSUtils, RapidsToolsConfUtil, ToolsPlanGraph, UTF8Source}

class SQLPlanParserSuite extends BaseTestSuite {

Expand Down Expand Up @@ -1832,4 +1832,31 @@ class SQLPlanParserSuite extends BaseTestSuite {
"CheckOverflowInTableInsert should not exist in the physical plan.")
}
}

test("MinBy and MaxBy are supported") {
// Test that aggregates minBy and MAxBy are marked as supported by the qualification tool.
TrampolineUtil.withTempDir { eventLogDir =>
val (eventLog, _) = ToolTestUtils.generateEventLog(eventLogDir, "minMaxBy") { spark =>
import spark.implicits._
val testData = Seq((1, 2, 3), (1, 4, 6), (2, 3, 4)).toDF("a", "b", "c")
testData.createOrReplaceTempView("t1")
spark.sql("SELECT a, min_by(b, c), max_by(b, c) FROM t1 GROUP BY a")
}
// validate that the eventlog contains MinBy and MaxBy
val reader = FSUtils.readFileContentAsUTF8(eventLog)
assert(reader.contains("min_by"))
assert(reader.contains("max_by"))
// run the qualification tool
val pluginTypeChecker = new PluginTypeChecker()
val app = createAppFromEventlog(eventLog)
val parsedPlans = app.sqlPlans.map { case (sqlID, plan) =>
SQLPlanParser.parseSQLPlan(app.appId, plan, sqlID, "", pluginTypeChecker, app)
}
// we should have 2 hash aggregates with min_by and max_by expressions
// if the min_by and max_by were not recognized, the test would fail
val hashAggExecs =
getAllExecsFromPlan(parsedPlans.toSeq).filter(_.exec.equals("HashAggregate"))
assertSizeAndSupported(2, hashAggExecs)
}
}
}
132 changes: 0 additions & 132 deletions scripts/sync_plugin_files/override_supported_configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -274,138 +274,6 @@
"value": "`hive-hash`"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "aggregation",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MaxBy",
"Context": "reduction",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "aggregation",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "value",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "ordering",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
},
{
"Expression": "MinBy",
"Context": "reduction",
"Params": "result",
"override": [
{
"key": "Supported",
"value": "TNEW"
}
]
}
]
}

0 comments on commit 12adc3f

Please sign in to comment.