From 57948c865e064469a75c92f8b58c632b9b40fdd3 Mon Sep 17 00:00:00 2001 From: Johan Lasperas Date: Thu, 16 May 2024 22:38:02 +0800 Subject: [PATCH] [SPARK-48308][CORE] Unify getting data schema without partition columns in FileSourceStrategy ### What changes were proposed in this pull request? Compute the schema of the data without partition columns only once in FileSourceStrategy. ### Why are the changes needed? In FileSourceStrategy, the schema of the data excluding partition columns is computed twice in slightly different ways: once using an AttributeSet (`partitionSet`) and once using the attributes directly (`partitionColumns`). These don't have exactly the same semantics: an AttributeSet only uses expression ids for comparison, whereas comparing with the actual attributes uses the name, type, nullability and metadata. We want to use the former here. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #46619 from johanl-db/reuse-schema-without-partition-columns. 
Authored-by: Johan Lasperas Signed-off-by: Wenchen Fan --- .../spark/sql/execution/datasources/FileSourceStrategy.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 8333c276cdd8e..d31cb111924b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -216,9 +216,8 @@ object FileSourceStrategy extends Strategy with PredicateHelper with Logging { val requiredExpressions: Seq[NamedExpression] = filterAttributes.toSeq ++ projects val requiredAttributes = AttributeSet(requiredExpressions) - val readDataColumns = dataColumns + val readDataColumns = dataColumnsWithoutPartitionCols .filter(requiredAttributes.contains) - .filterNot(partitionColumns.contains) // Metadata attributes are part of a column of type struct up to this point. Here we extract // this column from the schema and specify a matcher for that.