[SPARK-30751][SQL] Combine the skewed readers into one in AQE skew join optimizations


### What changes were proposed in this pull request?
This is a follow-up of #26434.

This PR uses one special shuffle reader for skew join, so that we only have one join after optimization. To do that, this PR:
1. adds a very general `CustomShuffledRowRDD` which supports all kinds of partition arrangements.
2. moves the logic of coalescing shuffle partitions into a utility function and calls it during skew join optimization, to fully decouple it from the `ReduceNumShufflePartitions` rule (a sketch of the coalescing idea is shown after this list). Interleaving skew join handling with `ReduceNumShufflePartitions` is too complicated, because the split partitions of a skewed partition already don't respect the target size.
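
For intuition, the coalescing utility mentioned in item 2 essentially groups consecutive reducer partitions until a target byte size is reached. Below is a minimal sketch of that idea only, not the actual utility added in this PR; the names `partitionSizes` and `targetSize` are illustrative assumptions.

```scala
import scala.collection.mutable.ArrayBuffer

object CoalesceSketch {
  // Sketch only: group consecutive reducer partitions until adding the next one would
  // exceed `targetSize`, and return the start index of each coalesced partition.
  def coalescePartitionStartIndices(partitionSizes: Array[Long], targetSize: Long): Array[Int] = {
    val startIndices = ArrayBuffer(0)
    var currentSize = 0L
    for ((size, i) <- partitionSizes.zipWithIndex) {
      if (i > 0 && currentSize + size > targetSize) {
        startIndices += i // start a new coalesced partition at reducer i
        currentSize = 0L
      }
      currentSize += size
    }
    startIndices.toArray
  }
}

// Example: sizes 10, 10, 30, 5 with targetSize = 25 give start indices 0, 2, 3,
// i.e. coalesced partitions [0, 1], [2] and [3].
```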

### Why are the changes needed?
The current skew join optimization has a serious performance issue: the size of the query plan depends on the number and size of skewed partitions.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Existing tests.

Tested the UI manually:
![image](https://user-images.githubusercontent.com/3182036/74357390-cfb30480-4dfa-11ea-83f6-825d1b9379ca.png)

Explain output:
```
AdaptiveSparkPlan(isFinalPlan=true)
+- OverwriteByExpression org.apache.spark.sql.execution.datasources.noop.NoopTable$403a2ed5, [AlwaysTrue()], org.apache.spark.sql.util.CaseInsensitiveStringMap1f
   +- *(5) SortMergeJoin(skew=true) [key1#2L], [key2#6L], Inner
      :- *(3) Sort [key1#2L ASC NULLS FIRST], false, 0
      :  +- SkewJoinShuffleReader 2 skewed partitions with size(max=5 KB, min=5 KB, avg=5 KB)
      :     +- ShuffleQueryStage 0
      :        +- Exchange hashpartitioning(key1#2L, 200), true, [id=#53]
      :           +- *(1) Project [(id#0L % 2) AS key1#2L]
      :              +- *(1) Filter isnotnull((id#0L % 2))
      :                 +- *(1) Range (0, 100000, step=1, splits=6)
      +- *(4) Sort [key2#6L ASC NULLS FIRST], false, 0
         +- SkewJoinShuffleReader 2 skewed partitions with size(max=5 KB, min=5 KB, avg=5 KB)
            +- ShuffleQueryStage 1
               +- Exchange hashpartitioning(key2#6L, 200), true, [id=#64]
                  +- *(2) Project [((id#4L % 2) + 1) AS key2#6L]
                     +- *(2) Filter isnotnull(((id#4L % 2) + 1))
                        +- *(2) Range (0, 100000, step=1, splits=6)
```

Closes #27493 from cloud-fan/aqe.

Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: herman <[email protected]>
(cherry picked from commit a4ceea6)
Signed-off-by: herman <[email protected]>
cloud-fan authored and hvanhovell committed Feb 13, 2020
1 parent 82258aa commit 78bd4b3
Showing 12 changed files with 741 additions and 703 deletions.
In `ShuffledRowRDD`, the constructor takes partition start indices instead of (start, end) index pairs:

```diff
@@ -116,7 +116,7 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A
 class ShuffledRowRDD(
     var dependency: ShuffleDependency[Int, InternalRow, InternalRow],
     metrics: Map[String, SQLMetric],
-    specifiedPartitionIndices: Option[Array[(Int, Int)]] = None)
+    specifiedPartitionStartIndices: Option[Array[Int]] = None)
   extends RDD[InternalRow](dependency.rdd.context, Nil) {

   if (SQLConf.get.fetchShuffleBlocksInBatchEnabled) {
@@ -126,8 +126,8 @@ class ShuffledRowRDD(

   private[this] val numPreShufflePartitions = dependency.partitioner.numPartitions

-  private[this] val partitionStartIndices: Array[Int] = specifiedPartitionIndices match {
-    case Some(indices) => indices.map(_._1)
+  private[this] val partitionStartIndices: Array[Int] = specifiedPartitionStartIndices match {
+    case Some(indices) => indices
     case None =>
       // When specifiedPartitionStartIndices is not defined, every post-shuffle partition
       // corresponds to a pre-shuffle partition.
@@ -142,15 +142,16 @@ class ShuffledRowRDD(
   override val partitioner: Option[Partitioner] = Some(part)

   override def getPartitions: Array[Partition] = {
-    specifiedPartitionIndices match {
-      case Some(indices) =>
-        Array.tabulate[Partition](indices.length) { i =>
-          new ShuffledRowRDDPartition(i, indices(i)._1, indices(i)._2)
-        }
-      case None =>
-        Array.tabulate[Partition](numPreShufflePartitions) { i =>
-          new ShuffledRowRDDPartition(i, i, i + 1)
-        }
+    assert(partitionStartIndices.length == part.numPartitions)
+    Array.tabulate[Partition](partitionStartIndices.length) { i =>
+      val startIndex = partitionStartIndices(i)
+      val endIndex =
+        if (i < partitionStartIndices.length - 1) {
+          partitionStartIndices(i + 1)
+        } else {
+          numPreShufflePartitions
+        }
+      new ShuffledRowRDDPartition(i, startIndex, endIndex)
     }
   }
```
A new `CustomShuffledRowRDD` is added (113 lines):

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.{Dependency, MapOutputTrackerMaster, Partition, ShuffleDependency, SparkEnv, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter}

sealed trait ShufflePartitionSpec

// A partition that reads data of one reducer.
case class SinglePartitionSpec(reducerIndex: Int) extends ShufflePartitionSpec

// A partition that reads data of multiple reducers, from `startReducerIndex` (inclusive) to
// `endReducerIndex` (exclusive).
case class CoalescedPartitionSpec(
    startReducerIndex: Int, endReducerIndex: Int) extends ShufflePartitionSpec

// A partition that reads partial data of one reducer, from `startMapIndex` (inclusive) to
// `endMapIndex` (exclusive).
case class PartialPartitionSpec(
    reducerIndex: Int, startMapIndex: Int, endMapIndex: Int) extends ShufflePartitionSpec

private final case class CustomShufflePartition(
    index: Int, spec: ShufflePartitionSpec) extends Partition

// TODO: merge this with `ShuffledRowRDD`, and replace `LocalShuffledRowRDD` with this RDD.
class CustomShuffledRowRDD(
    var dependency: ShuffleDependency[Int, InternalRow, InternalRow],
    metrics: Map[String, SQLMetric],
    partitionSpecs: Array[ShufflePartitionSpec])
  extends RDD[InternalRow](dependency.rdd.context, Nil) {

  override def getDependencies: Seq[Dependency[_]] = List(dependency)

  override def clearDependencies() {
    super.clearDependencies()
    dependency = null
  }

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](partitionSpecs.length) { i =>
      CustomShufflePartition(i, partitionSpecs(i))
    }
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    partition.asInstanceOf[CustomShufflePartition].spec match {
      case SinglePartitionSpec(reducerIndex) =>
        tracker.getPreferredLocationsForShuffle(dependency, reducerIndex)

      case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) =>
        startReducerIndex.until(endReducerIndex).flatMap { reducerIndex =>
          tracker.getPreferredLocationsForShuffle(dependency, reducerIndex)
        }

      case PartialPartitionSpec(_, startMapIndex, endMapIndex) =>
        tracker.getMapLocation(dependency, startMapIndex, endMapIndex)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
    val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics()
    // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator,
    // as well as the `tempMetrics` for basic shuffle metrics.
    val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics)
    val reader = split.asInstanceOf[CustomShufflePartition].spec match {
      case SinglePartitionSpec(reducerIndex) =>
        SparkEnv.get.shuffleManager.getReader(
          dependency.shuffleHandle,
          reducerIndex,
          reducerIndex + 1,
          context,
          sqlMetricsReporter)

      case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) =>
        SparkEnv.get.shuffleManager.getReader(
          dependency.shuffleHandle,
          startReducerIndex,
          endReducerIndex,
          context,
          sqlMetricsReporter)

      case PartialPartitionSpec(reducerIndex, startMapIndex, endMapIndex) =>
        SparkEnv.get.shuffleManager.getReaderForRange(
          dependency.shuffleHandle,
          startMapIndex,
          endMapIndex,
          reducerIndex,
          reducerIndex + 1,
          context,
          sqlMetricsReporter)
    }
    reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2)
  }
}
```
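
For illustration only, a hypothetical skew-join reader could describe its partitions with the specs above as follows; `dependency` and `metrics` are assumed to come from the shuffle exchange being read, and all index values are made up:

```scala
// Reducers 0-2 are small and coalesced into one partition, the skewed reducer 3 is
// split into two ranges of map outputs, and reducer 4 is read as a whole.
val partitionSpecs: Array[ShufflePartitionSpec] = Array(
  CoalescedPartitionSpec(startReducerIndex = 0, endReducerIndex = 3),
  PartialPartitionSpec(reducerIndex = 3, startMapIndex = 0, endMapIndex = 8),
  PartialPartitionSpec(reducerIndex = 3, startMapIndex = 8, endMapIndex = 16),
  SinglePartitionSpec(reducerIndex = 4))

val skewJoinReaderRDD = new CustomShuffledRowRDD(dependency, metrics, partitionSpecs)
```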
In `OptimizeLocalShuffleReader`:

```diff
@@ -71,7 +71,7 @@ case class OptimizeLocalShuffleReader(conf: SQLConf) extends Rule[SparkPlan] {
     plan match {
       case c @ CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) =>
         LocalShuffleReaderExec(
-          s, getPartitionStartIndices(s, Some(c.partitionIndices.length)))
+          s, getPartitionStartIndices(s, Some(c.partitionStartIndices.length)))
       case s: ShuffleQueryStageExec =>
         LocalShuffleReaderExec(s, getPartitionStartIndices(s, None))
     }
```