From a29df5fa02111f57965be2ab5e208f5c815265fe Mon Sep 17 00:00:00 2001 From: CodeGod <> Date: Sat, 9 Mar 2019 21:28:10 +0800 Subject: [PATCH] [SPARK-27080][SQL] bug fix: mergeWithMetastoreSchema with uniform lower case comparison ## What changes were proposed in this pull request? When reading a Parquet file and merging the metastore schema with the file schema, field names should be compared using a uniform case. The current implementation lowercases field names when building the inferred-field map, but one lookup into that map omits the lowercase conversion. This patch fixes that omission. ## How was this patch tested? Unit test Closes #24001 from codeborui/mergeSchemaBugFix. Authored-by: CodeGod <> Signed-off-by: Wenchen Fan --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../sql/hive/HiveSchemaInferenceSuite.scala | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 03f4b8d83e353..d6b2945b2ea7a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -326,8 +326,8 @@ private[hive] object HiveMetastoreCatalog { // Merge missing nullable fields to inferred schema and build a case-insensitive field map. 
val inferredFields = StructType(inferredSchema ++ missingNullables) .map(f => f.name.toLowerCase -> f).toMap + StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name.toLowerCase).name))) // scalastyle:on caselocale - StructType(metastoreSchema.map(f => f.copy(name = inferredFields(f.name).name))) } catch { case NonFatal(_) => val msg = s"""Detected conflicting schemas when merging the schema obtained from the Hive diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala index aa4fc13333c48..590ef949ffbd7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSchemaInferenceSuite.scala @@ -264,6 +264,32 @@ class HiveSchemaInferenceSuite StructType(Seq(StructField("lowerCase", BinaryType)))) } + // Parquet schema is subset of metaStore schema and has uppercase field name + assertResult( + StructType(Seq( + StructField("UPPERCase", DoubleType, nullable = true), + StructField("lowerCase", BinaryType, nullable = true)))) { + + HiveMetastoreCatalog.mergeWithMetastoreSchema( + StructType(Seq( + StructField("UPPERCase", DoubleType, nullable = true), + StructField("lowerCase", BinaryType, nullable = true))), + + StructType(Seq( + StructField("lowerCase", BinaryType, nullable = true)))) + } + + // Metastore schema contains an additional non-nullable field with an uppercase name. + assert(intercept[Throwable] { + HiveMetastoreCatalog.mergeWithMetastoreSchema( + StructType(Seq( + StructField("UPPERCase", DoubleType, nullable = false), + StructField("lowerCase", BinaryType, nullable = true))), + + StructType(Seq( + StructField("lowerCase", BinaryType, nullable = true)))) + }.getMessage.contains("Detected conflicting schemas")) + // Check that merging missing nullable fields works as expected. assertResult( StructType(Seq(