diff --git a/core/src/main/resources/operatorsScore-databricks-aws-a10G.csv b/core/src/main/resources/operatorsScore-databricks-aws-a10G.csv index 4bfd05272..81d5edc82 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws-a10G.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws-a10G.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.45 +MapInArrowExec,2.45 WindowInPandasExec,1.2 KMeans-pyspark,8.86 KMeans-scala,1.0 diff --git a/core/src/main/resources/operatorsScore-databricks-aws-t4.csv b/core/src/main/resources/operatorsScore-databricks-aws-t4.csv index 4bfd05272..81d5edc82 100644 --- a/core/src/main/resources/operatorsScore-databricks-aws-t4.csv +++ b/core/src/main/resources/operatorsScore-databricks-aws-t4.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.45 +MapInArrowExec,2.45 WindowInPandasExec,1.2 KMeans-pyspark,8.86 KMeans-scala,1.0 diff --git a/core/src/main/resources/operatorsScore-databricks-azure-t4.csv b/core/src/main/resources/operatorsScore-databricks-azure-t4.csv index bcee82935..cb84688e5 100644 --- a/core/src/main/resources/operatorsScore-databricks-azure-t4.csv +++ b/core/src/main/resources/operatorsScore-databricks-azure-t4.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.73 +MapInArrowExec,2.73 WindowInPandasExec,1.2 RoundCeil,2.73 RoundFloor,2.73 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv index 7cb7e508c..566d26b0c 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-l4.csv @@ -267,6 +267,8 @@ ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 FlatMapCoGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,3.74 +MapInArrowExec,3.74 WindowInPandasExec,1.2 RoundCeil,3.74 RoundFloor,3.74 diff --git a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv index 44224074e..d1b3376c9 100644 --- a/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-gke-t4.csv @@ -267,6 +267,8 @@ ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 FlatMapCoGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,3.65 +MapInArrowExec,3.65 WindowInPandasExec,1.2 RoundCeil,3.65 RoundFloor,3.65 diff --git a/core/src/main/resources/operatorsScore-dataproc-l4.csv b/core/src/main/resources/operatorsScore-dataproc-l4.csv index 629133560..4cedbd62c 100644 --- a/core/src/main/resources/operatorsScore-dataproc-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-l4.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,4.16 +MapInArrowExec,4.16 WindowInPandasExec,1.2 RoundCeil,4.16 RoundFloor,4.16 diff --git a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv index 68ed9d3ef..5d724c037 100644 --- a/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-serverless-l4.csv @@ -267,6 +267,8 @@ ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 FlatMapCoGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,4.25 +MapInArrowExec,4.25 WindowInPandasExec,1.2 RoundCeil,4.25 RoundFloor,4.25 diff --git a/core/src/main/resources/operatorsScore-dataproc-t4.csv b/core/src/main/resources/operatorsScore-dataproc-t4.csv index 9b7f86433..42310164a 100644 --- a/core/src/main/resources/operatorsScore-dataproc-t4.csv +++ b/core/src/main/resources/operatorsScore-dataproc-t4.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,4.88 +MapInArrowExec,4.88 WindowInPandasExec,1.2 RoundCeil,4.88 RoundFloor,4.88 diff --git a/core/src/main/resources/operatorsScore-emr-a10.csv b/core/src/main/resources/operatorsScore-emr-a10.csv index 2cfc40348..2cd9d57a5 100644 --- a/core/src/main/resources/operatorsScore-emr-a10.csv +++ b/core/src/main/resources/operatorsScore-emr-a10.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.59 +MapInArrowExec,2.59 WindowInPandasExec,1.2 RoundCeil,2.59 RoundFloor,2.59 diff --git a/core/src/main/resources/operatorsScore-emr-a10G.csv b/core/src/main/resources/operatorsScore-emr-a10G.csv index 2cfc40348..2cd9d57a5 100644 --- a/core/src/main/resources/operatorsScore-emr-a10G.csv +++ b/core/src/main/resources/operatorsScore-emr-a10G.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.59 +MapInArrowExec,2.59 WindowInPandasExec,1.2 RoundCeil,2.59 RoundFloor,2.59 diff --git a/core/src/main/resources/operatorsScore-emr-t4.csv b/core/src/main/resources/operatorsScore-emr-t4.csv index b2c3fba8f..e738376f1 100644 --- a/core/src/main/resources/operatorsScore-emr-t4.csv +++ b/core/src/main/resources/operatorsScore-emr-t4.csv @@ -273,6 +273,8 @@ AggregateInPandasExec,1.2 ArrowEvalPythonExec,1.2 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,2.07 +MapInArrowExec,2.07 WindowInPandasExec,1.2 RoundCeil,2.07 RoundFloor,2.07 diff --git a/core/src/main/resources/operatorsScore-onprem-a100.csv b/core/src/main/resources/operatorsScore-onprem-a100.csv index f4a39170f..3ac26e5ac 100644 --- a/core/src/main/resources/operatorsScore-onprem-a100.csv +++ b/core/src/main/resources/operatorsScore-onprem-a100.csv @@ -38,6 +38,8 @@ ArrowEvalPythonExec,1.2 FlatMapCoGroupsInPandasExec,3.0 FlatMapGroupsInPandasExec,1.2 MapInPandasExec,1.2 +PythonMapInArrowExec,3.0 +MapInArrowExec,3.0 WindowInPandasExec,1.2 WindowExec,3.0 WindowGroupLimitExec,3.0 diff --git a/core/src/main/resources/supportedExecs.csv b/core/src/main/resources/supportedExecs.csv index 1116737d9..df7815e48 100644 --- a/core/src/main/resources/supportedExecs.csv +++ b/core/src/main/resources/supportedExecs.csv @@ -49,7 +49,8 @@ ArrowEvalPythonExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,P FlatMapCoGroupsInPandasExec,NS,This is disabled by default because Performance is not ideal with many small groups,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS,NS,NS FlatMapGroupsInPandasExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,NS,NS,NS,NS,NS,NS MapInPandasExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS,NS,NS -PythonMapInArrowExec,TNEW,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS,NS,NS +PythonMapInArrowExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS,NS,NS +MapInArrowExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,PS,NS,NS,NS WindowInPandasExec,NS,This is disabled by default because it only supports row based frame for now,Input/Output,S,S,S,S,S,S,S,S,PS,S,NS,NS,NS,NS,PS,NS,NS,NS,NS,NS WindowExec,S,None,partitionSpec,S,S,S,S,S,S,S,S,PS,S,S,S,NS,NS,NS,NS,PS,NS,NS,NS WindowExec,S,None,Input/Output,S,S,S,S,S,S,S,S,PS,S,S,S,S,NS,PS,PS,PS,NS,NS,NS diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/PythonMapInArrowExecParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/PythonMapInArrowExecParser.scala new file mode 100644 index 000000000..08f8eedef --- /dev/null +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/PythonMapInArrowExecParser.scala @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.tool.planparser + +import com.nvidia.spark.rapids.tool.qualification.PluginTypeChecker + +import org.apache.spark.sql.execution.ui.SparkPlanGraphNode + +case class PythonMapInArrowExecParser( + node: SparkPlanGraphNode, + checker: PluginTypeChecker, + sqlID: Long) extends ExecParser { + + val fullExecName: String = node.name + "Exec" + + override def parse: ExecInfo = { + // PythonMapInArrow doesn't have duration + val duration = None + val (speedupFactor, isSupported) = if (checker.isExecSupported(fullExecName)) { + (checker.getSpeedupFactor(fullExecName), true) + } else { + (1.0, false) + } + // TODO - add in parsing expressions - average speedup across? + ExecInfo(node, sqlID, node.name, "", speedupFactor, duration, node.id, isSupported, None) + } +} diff --git a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala index c76ae7e73..a82337dd5 100644 --- a/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala +++ b/core/src/main/scala/com/nvidia/spark/rapids/tool/planparser/SQLPlanParser.scala @@ -513,6 +513,8 @@ object SQLPlanParser extends Logging { ObjectHashAggregateExecParser(node, checker, sqlID, app).parse case "Project" => ProjectExecParser(node, checker, sqlID).parse + case "PythonMapInArrow" | "MapInArrow" => + PythonMapInArrowExecParser(node, checker, sqlID).parse case "Range" => RangeExecParser(node, checker, sqlID).parse case "Sample" => diff --git a/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala b/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala index 704e7d00e..b736460ac 100644 --- a/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala +++ b/core/src/main/scala/org/apache/spark/sql/rapids/tool/ToolUtils.scala @@ -362,7 +362,7 @@ object ExecHelper { // we don't want to mark the *InPandas and ArrowEvalPythonExec as unsupported with UDF private val skipUDFCheckExecs = Seq("ArrowEvalPython", "AggregateInPandas", - "FlatMapGroupsInPandas", "MapInPandas", "WindowInPandas") + "FlatMapGroupsInPandas", "MapInPandas", "WindowInPandas", "PythonMapInArrow", "MapInArrow") // Set containing execs that should be labeled as "shouldRemove" private val execsToBeRemoved = Set( diff --git a/scripts/sync_plugin_files/override_supported_configs.json b/scripts/sync_plugin_files/override_supported_configs.json index 6c92ad145..9fa844ef7 100644 --- a/scripts/sync_plugin_files/override_supported_configs.json +++ b/scripts/sync_plugin_files/override_supported_configs.json @@ -450,16 +450,6 @@ "value": "TNEW" } ] - }, - { - "Exec": "PythonMapInArrowExec", - "Params": "Input/Output", - "override": [ - { - "key": "Supported", - "value": "TNEW" - } - ] } ] } \ No newline at end of file