[SPARK-34897][SQL] Support reconcile schemas based on index after nested column pruning #31993

Closed · wants to merge 9 commits
@@ -21,8 +21,9 @@ import org.apache.spark.sql.types._

object SchemaPruning {
/**
* Filters the schema by the requested fields. For example, if the schema is struct<a:int, b:int>,
* and given requested field are "a", the field "b" is pruned in the returned schema.
* Prunes the nested schema by the requested fields. For example, if the schema is
* struct<a:int, b:int>, and given requested field are "a", the field "b" is pruned in the
@cloud-fan (Contributor), Apr 19, 2021:

the example is incorrect now. This method doesn't prune top-level fields.

Member (Author):

Change the example to id int, struct<a:int, b:int> .

* returned schema.
cloud-fan marked this conversation as resolved.
* Note that schema field ordering at original schema is still preserved in pruned schema.
*/
def pruneDataSchema(
@@ -32,11 +33,10 @@ object SchemaPruning {
// in the resulting schema may differ from their ordering in the logical relation's
// original schema
val mergedSchema = requestedRootFields
.map { case root: RootField => StructType(Array(root.field)) }
.map { root: RootField => StructType(Array(root.field)) }
.reduceLeft(_ merge _)
val dataSchemaFieldNames = dataSchema.fieldNames.toSet
val mergedDataSchema =
StructType(mergedSchema.filter(f => dataSchemaFieldNames.contains(f.name)))
StructType(dataSchema.map(s => mergedSchema.find(_.name.equals(s.name)).getOrElse(s)))
Contributor:

what's the actual difference? can you give a simple example?

Contributor:

It seems we don't prune anything from the root fields now.

Contributor:

if this is the case please update the document of this method.

@wangyum (Member, Author), Apr 14, 2021:

spark.sql(
  """
    |CREATE TABLE t1 (
    |  _col0 INT,
    |  _col1 STRING,
    |  _col2 STRUCT<c1: STRING, c2: STRING, c3: STRING, c4: BIGINT>)
    |USING ORC
    |""".stripMargin)


spark.sql("SELECT _col0, _col2.c1 FROM t1").show

The original schema is:

`_col0` INT,`_col1` STRING,`_col2` STRUCT<`c1`: STRING, `c2`: STRING, `c3`: STRING, `c4`: BIGINT> 

Before this PR, pruneDataSchema returns:

`_col0` INT,`_col2` STRUCT<`c1`: STRING>

After this PR, pruneDataSchema returns:

`_col0` INT,`_col1` STRING,`_col2` STRUCT<`c1`: STRING>

It only prunes nested schemas.
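
For illustration, here is a minimal sketch of the behavior change in terms of the pruneDataSchema API quoted later in this thread. The import paths and the REPL-style usage are assumptions for the sketch, not part of this PR:

import org.apache.spark.sql.types._
// Assumed import location for SchemaPruning; the exact package may differ by Spark branch.
import org.apache.spark.sql.catalyst.expressions.SchemaPruning
import org.apache.spark.sql.catalyst.expressions.SchemaPruning.RootField

val dataSchema = StructType.fromDDL(
  "_col0 INT, _col1 STRING, _col2 STRUCT<c1: STRING, c2: STRING, c3: STRING, c4: BIGINT>")

// Root fields requested by `SELECT _col0, _col2.c1 FROM t1`.
val requestedRootFields = Seq(
  RootField(StructField("_col0", IntegerType), derivedFromAtt = true),
  RootField(StructField("_col2", StructType.fromDDL("c1 STRING")), derivedFromAtt = true))

val pruned = SchemaPruning.pruneDataSchema(dataSchema, requestedRootFields)
// Before this PR: _col0 INT, _col2 STRUCT<c1: STRING>                (_col1 dropped)
// After this PR:  _col0 INT, _col1 STRING, _col2 STRUCT<c1: STRING>  (only nested fields pruned)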

Contributor:

What's wrong with the previous behavior? We can't sacrifice performance in all cases just because the ORC read-by-ordinal case is problematic.

Member (Author):

> is it because column pruning will be done by other rules so we don't need to consider it here?

Yes.

val readDataColumns =
  dataColumns
    .filter(requiredAttributes.contains)
    .filterNot(partitionColumns.contains)
val outputSchema = readDataColumns.toStructType
logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}")

val neededFieldNames = neededOutput.map(_.name).toSet
r.pruneColumns(StructType(prunedSchema.filter(f => neededFieldNames.contains(f.name))))

Contributor:

Can you provide the full code workflow to explain why this causes issues in ORC? I'm still not very sure.

Member (Author):

  1. Prune the nested schema:

         def pruneDataSchema(
             dataSchema: StructType,
             requestedRootFields: Seq[RootField]): StructType = {
           // Merge the requested root fields into a single schema. Note the ordering of the fields
           // in the resulting schema may differ from their ordering in the logical relation's
           // original schema
           val mergedSchema = requestedRootFields
             .map { case root: RootField => StructType(Array(root.field)) }
             .reduceLeft(_ merge _)
           val dataSchemaFieldNames = dataSchema.fieldNames.toSet
           val mergedDataSchema =
             StructType(mergedSchema.filter(f => dataSchemaFieldNames.contains(f.name)))
           // Sort the fields of mergedDataSchema according to their order in dataSchema,
           // recursively. This makes mergedDataSchema a pruned schema of dataSchema
           sortLeftFieldsByRight(mergedDataSchema, dataSchema).asInstanceOf[StructType]
         }

  2. Use this pruned nested schema to build the dataSchema of the relation:

         if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
           val prunedRelation = leafNodeBuilder(prunedDataSchema)
           val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
           Some(buildNewProjection(normalizedProjects, normalizedFilters, prunedRelation,
             projectionOverSchema))

  3. readDataColumns performs the complete top-level column pruning:

         val readDataColumns =
           dataColumns
             .filter(requiredAttributes.contains)
             .filterNot(partitionColumns.contains)
         val outputSchema = readDataColumns.toStructType
         logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}")
         val outputAttributes = readDataColumns ++ partitionColumns
         val scan =
           FileSourceScanExec(
             fsRelation,
             outputAttributes,
             outputSchema,
             partitionKeyFilters.toSeq,
             bucketSet,
             None,
             dataFilters,
             table.map(_.identifier))

  4. dataSchema comes from relation.dataSchema, which is the pruned nested schema:

         lazy val inputRDD: RDD[InternalRow] = {
           val readFile: (PartitionedFile) => Iterator[InternalRow] =
             relation.fileFormat.buildReaderWithPartitionValues(
               sparkSession = relation.sparkSession,
               dataSchema = relation.dataSchema,
               partitionSchema = relation.partitionSchema,
               requiredSchema = requiredSchema,
               filters = pushedDownFilters,
               options = relation.options,
               hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options))

  5. OrcUtils.requestedColumnIds uses this pruned nested schema:

         val resultedColPruneInfo =
           Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader =>
             OrcUtils.requestedColumnIds(
               isCaseSensitive, dataSchema, requiredSchema, reader, conf)
           }

Member:

It is because requestedColumnIds checks whether the given data schema has fewer fields than the physical schema in the ORC file.

Under nested column pruning, Spark lets the data source use the pruned schema as the data schema to read files. E.g., Spark prunes _col1 in the above example. But the ORC file has three top-level fields _col0, _col1, and _col2, so the check in requestedColumnIds fails in that case.
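
As a rough sketch of the check being described (paraphrased, not the exact OrcUtils source; names such as orcFieldNames and checkReadByOrdinal are assumptions):

import org.apache.spark.sql.types.StructType

// Hive-written ORC files only carry placeholder names (_col0, _col1, ...), so columns are
// matched to the data schema by index. If a top-level field was pruned away, the data schema
// has fewer fields than the file and there is no way to tell which column was dropped.
def checkReadByOrdinal(orcFieldNames: Seq[String], dataSchema: StructType): Unit = {
  if (orcFieldNames.forall(_.startsWith("_col"))) {
    assert(orcFieldNames.length <= dataSchema.length,
      s"The given data schema ${dataSchema.catalogString} has fewer fields than the " +
        "actual ORC physical schema, no idea which columns were dropped, fail to read.")
  }
}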

Member:

> is it because column pruning will be done by other rules so we don't need to consider it here?
>
> Yes.

Hmm? In PushDownUtils.pruneColumns, if you enable nested column pruning, Spark will only run the path of nested column pruning, not the quoted L96-97.

// Sort the fields of mergedDataSchema according to their order in dataSchema,
// recursively. This makes mergedDataSchema a pruned schema of dataSchema
sortLeftFieldsByRight(mergedDataSchema, dataSchema).asInstanceOf[StructType]
@@ -24,22 +24,28 @@ class SchemaPruningSuite extends SparkFunSuite {
test("prune schema by the requested fields") {
def testPrunedSchema(
schema: StructType,
requestedFields: StructField*): Unit = {
requestedFields: Seq[StructField],
expectedSchema: StructType): Unit = {
val requestedRootFields = requestedFields.map { f =>
// `derivedFromAtt` doesn't affect the result of pruned schema.
SchemaPruning.RootField(field = f, derivedFromAtt = true)
}
val expectedSchema = SchemaPruning.pruneDataSchema(schema, requestedRootFields)
assert(expectedSchema == StructType(requestedFields))
val prunedSchema = SchemaPruning.pruneDataSchema(schema, requestedRootFields)
assert(prunedSchema === expectedSchema)
}

testPrunedSchema(StructType.fromDDL("a int, b int"), StructField("a", IntegerType))
testPrunedSchema(StructType.fromDDL("a int, b int"), StructField("b", IntegerType))
testPrunedSchema(
StructType.fromDDL("a int, b int"),
Seq(StructField("a", IntegerType)),
StructType.fromDDL("a int, b int"))

val structOfStruct = StructType.fromDDL("a struct<a:int, b:int>, b int")
testPrunedSchema(structOfStruct, StructField("a", StructType.fromDDL("a int, b int")))
testPrunedSchema(structOfStruct, StructField("b", IntegerType))
testPrunedSchema(structOfStruct, StructField("a", StructType.fromDDL("b int")))
testPrunedSchema(structOfStruct,
Seq(StructField("a", StructType.fromDDL("a int")), StructField("b", IntegerType)),
StructType.fromDDL("a struct<a:int>, b int"))
testPrunedSchema(structOfStruct,
Seq(StructField("a", StructType.fromDDL("a int"))),
StructType.fromDDL("a struct<a:int>, b int"))

val arrayOfStruct = StructField("a", ArrayType(StructType.fromDDL("a int, b int, c string")))
val mapOfStruct = StructField("d", MapType(StructType.fromDDL("a int, b int, c string"),
@@ -49,14 +55,31 @@ class SchemaPruningSuite extends SparkFunSuite {
arrayOfStruct :: StructField("b", structOfStruct) :: StructField("c", IntegerType) ::
mapOfStruct :: Nil)

testPrunedSchema(complexStruct, StructField("a", ArrayType(StructType.fromDDL("b int"))),
StructField("b", StructType.fromDDL("a int")))
testPrunedSchema(complexStruct,
StructField("a", ArrayType(StructType.fromDDL("b int, c string"))),
StructField("b", StructType.fromDDL("b int")))
Seq(StructField("a", ArrayType(StructType.fromDDL("b int"))),
StructField("b", StructType.fromDDL("a int"))),
StructType(
StructField("a", ArrayType(StructType.fromDDL("b int"))) ::
StructField("b", StructType.fromDDL("a int")) ::
StructField("c", IntegerType) ::
mapOfStruct :: Nil))
testPrunedSchema(complexStruct,
Seq(StructField("a", ArrayType(StructType.fromDDL("b int, c string"))),
StructField("b", StructType.fromDDL("b int"))),
StructType(
StructField("a", ArrayType(StructType.fromDDL("b int, c string"))) ::
StructField("b", StructType.fromDDL("b int")) ::
StructField("c", IntegerType) ::
mapOfStruct :: Nil))

val selectFieldInMap = StructField("d", MapType(StructType.fromDDL("a int, b int"),
StructType.fromDDL("e int, f string")))
testPrunedSchema(complexStruct, StructField("c", IntegerType), selectFieldInMap)
testPrunedSchema(complexStruct,
Seq(StructField("c", IntegerType), selectFieldInMap),
StructType(
arrayOfStruct ::
StructField("b", structOfStruct) ::
StructField("c", IntegerType) ::
selectFieldInMap :: Nil))
}
}
@@ -81,6 +81,10 @@ object PushDownUtils extends PredicateHelper {
relation: DataSourceV2Relation,
projects: Seq[NamedExpression],
filters: Seq[Expression]): (Scan, Seq[AttributeReference]) = {
val exprs = projects ++ filters
val requiredColumns = AttributeSet(exprs.flatMap(_.references))
val neededOutput = relation.output.filter(requiredColumns.contains)

scanBuilder match {
case r: SupportsPushDownRequiredColumns if SQLConf.get.nestedSchemaPruningEnabled =>
val rootFields = SchemaPruning.identifyRootFields(projects, filters)
@@ -89,14 +93,12 @@
} else {
new StructType()
}
r.pruneColumns(prunedSchema)
val neededFieldNames = neededOutput.map(_.name).toSet
r.pruneColumns(StructType(prunedSchema.filter(f => neededFieldNames.contains(f.name))))
Member (Author):

Move the filter logic from SchemaPruning to PushDownUtils to support data source V2 column pruning.
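
In isolation, the step that moves here is just an intersection of the nested-pruned schema with the top-level columns that the projects/filters actually reference; a minimal standalone sketch (the helper name is illustrative, not from this PR):

import org.apache.spark.sql.types.StructType

// Keep only the top-level fields of the nested-pruned schema that are actually referenced.
def pruneTopLevel(prunedSchema: StructType, neededFieldNames: Set[String]): StructType =
  StructType(prunedSchema.filter(f => neededFieldNames.contains(f.name)))

// e.g. pruneTopLevel(StructType.fromDDL("a struct<a:int>, b int"), Set("a"))
//      returns StructType.fromDDL("a struct<a:int>")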

val scan = r.build()
scan -> toOutputAttrs(scan.readSchema(), relation)

case r: SupportsPushDownRequiredColumns =>
val exprs = projects ++ filters
val requiredColumns = AttributeSet(exprs.flatMap(_.references))
val neededOutput = relation.output.filter(requiredColumns.contains)
r.pruneColumns(neededOutput.toStructType)
val scan = r.build()
// always project, in case the relation's output has been updated and doesn't match
@@ -633,4 +633,20 @@ class OrcSourceSuite extends OrcSuite with SharedSparkSession {
}
}
}

test("SPARK-34897: Support reconcile schemas based on index after nested column pruning") {
withTable("t1") {
spark.sql(
"""
|CREATE TABLE t1 (
| _col0 INT,
| _col1 STRING,
| _col2 STRUCT<c1: STRING, c2: STRING, c3: STRING, c4: BIGINT>)
|USING ORC
|""".stripMargin)

spark.sql("INSERT INTO t1 values(1, '2', struct('a', 'b', 'c', 10L))")
checkAnswer(spark.sql("SELECT _col0, _col2.c1 FROM t1"), Seq(Row(1, "a")))
}
}
}