[SPARK-25557][SQL] Nested column predicate pushdown for ORC #28761

Closed · wants to merge 9 commits
Changes from 3 commits
@@ -2080,9 +2080,9 @@ object SQLConf {
.doc("A comma-separated list of data source short names or fully qualified data source " +
"implementation class names for which Spark tries to push down predicates for nested " +
"columns and/or names containing `dots` to data sources. This configuration is only " +
"effective with file-based data source in DSv1. Currently, Parquet implements " +
"both optimizations while ORC only supports predicates for names containing `dots`. The " +
"other data sources don't support this feature yet. So the default value is 'parquet,orc'.")
"effective with file-based data source in DSv1. Currently, Parquet and ORC implement " +
"both optimizations. The other data sources don't support this feature yet. So the " +
"default value is 'parquet,orc'.")
.version("3.0.0")
.stringConf
.createWithDefault("parquet,orc")
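For context, a minimal sketch of how a user would narrow this list; the config key below is assumed from SQLConf and is not shown in this hunk:

```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch, assuming the key is
// "spark.sql.optimizer.nestedPredicatePushdown.supportedFileSources"
// (the key name is not visible in this hunk). Restrict nested predicate
// pushdown to Parquet only, e.g. to compare plans with and without ORC support.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.conf.set(
  "spark.sql.optimizer.nestedPredicatePushdown.supportedFileSources", "parquet")
```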
@@ -668,6 +668,8 @@ abstract class PushableColumnBase {
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper
def helper(e: Expression): Option[Seq[String]] = e match {
case a: Attribute =>
// An attribute whose name contains a dot "." is supported only when
// nested predicate pushdown is enabled.
if (nestedPredicatePushdownEnabled || !a.name.contains(".")) {
Some(Seq(a.name))
} else {
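A small sketch of the name handling this relies on: with the MultipartIdentifierHelper import used above, `quoted` joins name parts with dots and back-quotes any part that itself contains a dot, so a nested field `b` inside struct `a` and a top-level column literally named "a.b" stay distinguishable:

```scala
// REPL-style sketch, with Spark on the classpath.
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper

val nested = Seq("a", "b").quoted      // "a.b"          -> field b inside struct a
val dotted = Seq("a.b", "c.d").quoted  // "`a.b`.`c.d`"  -> column names that contain dots
```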
@@ -17,8 +17,11 @@

package org.apache.spark.sql.execution.datasources.orc

import java.util.Locale

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.sources.{And, Filter}
import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType}
import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType, StructField, StructType}

/**
* Methods that can be shared when upgrading the built-in Hive.
@@ -37,12 +40,44 @@ trait OrcFiltersBase {
}

/**
* Return true if this is a searchable type in ORC.
* Both CharType and VarcharType are cleaned at AstBuilder.
* This method returns a map which contains ORC field name and data type. Each key
* represents a column; `dots` are used as separators for nested columns. If any part
* of the names contains `dots`, it is quoted to avoid confusion. See
* `org.apache.spark.sql.connector.catalog.quote` for implementation details.
 */
[Review comment — Member] quote -> quoted.
protected[sql] def isSearchableType(dataType: DataType) = dataType match {
case BinaryType => false
case _: AtomicType => true
case _ => false
protected[sql] def getNameToOrcFieldMap(
[Review comment — dongjoon-hyun, Member, Aug 6, 2020]
  1. OrcField looks a little mismatched because this function returns a DataType instead of a field. Currently, it sounds like ToOrcField.
  2. According to the behavior of this function, it ignores BinaryType, complex types, and UserDefinedType. Also, the function description doesn't mention the limitation at all. To be clearer, we had better keep Searchable in the function name, like the previous isSearchableType.
schema: StructType,
caseSensitive: Boolean): Map[String, DataType] = {
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper

def getPrimitiveFields(
fields: Seq[StructField],
parentFieldNames: Array[String] = Array.empty): Seq[(String, DataType)] = {
fields.flatMap { f =>
f.dataType match {
case st: StructType =>
getPrimitiveFields(st.fields.toSeq, parentFieldNames :+ f.name)
case BinaryType => None
case _: AtomicType =>
Some(((parentFieldNames :+ f.name).toSeq.quoted, f.dataType))
case _ => None
}
}
}

val primitiveFields = getPrimitiveFields(schema.fields)
if (caseSensitive) {
primitiveFields.toMap
} else {
// Don't consider ambiguity here, i.e. more than one field is matched in case insensitive
// mode, just skip pushdown for these fields, they will trigger Exception when reading,
// See: SPARK-25175.
[Review comment — Member] is matched -> are matched?
val dedupPrimitiveFields =
primitiveFields
[Review comment — Member] indentation?
- val dedupPrimitiveFields =
- primitiveFields
+ val dedupPrimitiveFields = primitiveFields
.groupBy(_._1.toLowerCase(Locale.ROOT))
.filter(_._2.size == 1)
.mapValues(_.head._2)
CaseInsensitiveMap(dedupPrimitiveFields)
}
}
}
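To make the result shape concrete, a small sketch with an illustrative (hypothetical) schema, following the implementation above: leaf atomic fields are keyed by their dot-separated path, binary and other non-searchable leaves are skipped, and names containing dots are back-quoted.

```scala
import org.apache.spark.sql.types._

// Hypothetical schema, not from the PR itself.
val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("a", StructType(Seq(
    StructField("b", StringType),
    StructField("bin", BinaryType)))),  // BinaryType leaf: not searchable, skipped
  StructField("c.d", LongType)))        // top-level name containing a dot

// Expected mapping in case-sensitive mode, per the implementation above:
//   Map("id" -> IntegerType, "a.b" -> StringType, "`c.d`" -> LongType)
```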
@@ -27,6 +27,7 @@ import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters}
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.orc.OrcFilters
import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -60,10 +61,8 @@ case class OrcScanBuilder(
// changed `hadoopConf` in executors.
OrcInputFormat.setSearchArgument(hadoopConf, f, schema.fieldNames)
}
val dataTypeMap = schema.map(f => quoteIfNeeded(f.name) -> f.dataType).toMap
// TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed.
val newFilters = filters.filter(!_.containsNestedColumn)
_pushedFilters = OrcFilters.convertibleFilters(schema, dataTypeMap, newFilters).toArray
val dataTypeMap = OrcFilters.getNameToOrcFieldMap(schema, SQLConf.get.caseSensitiveAnalysis)
_pushedFilters = OrcFilters.convertibleFilters(schema, dataTypeMap, filters).toArray
}
filters
}
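A minimal sketch of the user-visible effect of wiring the new map into the scan builder, assuming a local session and a hypothetical output path; the exact PushedFilters output in explain() depends on the build:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Write a small ORC file with a nested struct column, then filter on the
// nested field. With this change the predicate on a.id should show up in the
// scan's pushed filters instead of being dropped.
val path = "/tmp/orc-nested-pushdown-demo"  // hypothetical path
Seq((1, "x"), (2, "y")).toDF("id", "name")
  .selectExpr("struct(id, name) AS a")
  .write.mode("overwrite").orc(path)

spark.read.orc(path).filter("a.id = 1").explain(true)
```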
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.struct
import org.apache.spark.sql.types.StructType

/**
 * Defines shared helpers for nested column predicate pushdown tests, e.g. `ParquetFilterSuite`.
 */
trait NestedColumnPredicateTest {
/**
* Takes a single-level `inputDF` DataFrame and generates multi-level nested
* DataFrames as new test data.
*
* This method accepts a function that runs the actual test. The given function takes three
* parameters: a DataFrame that ranges from zero-nested to multi-level nested,
* the name of the primitive field as a string, and a function that produces the expected
* result for the collected column.
*/
protected def withNestedDataFrame(inputDF: DataFrame)
(runTest: (DataFrame, String, Any => Any) => Unit): Unit = {
assert(inputDF.schema.fields.length == 1)
assert(!inputDF.schema.fields.head.dataType.isInstanceOf[StructType])
val df = inputDF.toDF("temp")
Seq(
(
df.withColumnRenamed("temp", "a"),
"a", // zero nesting
(x: Any) => x),
(
df.withColumn("a", struct(df("temp") as "b")).drop("temp"),
"a.b", // one level nesting
(x: Any) => Row(x)),
(
df.withColumn("a", struct(struct(df("temp") as "c") as "b")).drop("temp"),
"a.b.c", // two level nesting
(x: Any) => Row(Row(x))
),
(
df.withColumnRenamed("temp", "a.b"),
"`a.b`", // zero nesting with column name containing `dots`
(x: Any) => x
),
(
df.withColumn("a.b", struct(df("temp") as "c.d") ).drop("temp"),
"`a.b`.`c.d`", // one level nesting with column names containing `dots`
(x: Any) => Row(x)
)
).foreach { case (df, colName, resultFun) =>
runTest(df, colName, resultFun)
}
}
}
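A sketch of how a format suite might drive this helper; the suite name and test body are hypothetical, and real suites such as ParquetFilterSuite additionally assert on the pushed-down filter classes:

```scala
import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.NestedColumnPredicateTest
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical suite: for every generated nesting level, filter on the
// produced column name and check the collected top-level column against the
// value wrapped by resultFun.
class NestedPredicateDemoSuite
  extends QueryTest with NestedColumnPredicateTest with SharedSparkSession {
  import testImplicits._

  test("filtering works at every nesting level") {
    withNestedDataFrame(Seq(1, 2, 3).toDF()) { case (df, colName, resultFun) =>
      checkAnswer(df.filter(s"$colName = 2"), Seq(Row(resultFun(2))))
    }
  }
}
```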
@@ -78,12 +78,16 @@ abstract class OrcTest extends QueryTest with FileBasedDataSourceTest with Befor
(f: String => Unit): Unit = withDataSourceFile(data)(f)

/**
* Writes `data` to a Orc file and reads it back as a `DataFrame`,
* Writes the `df` DataFrame to an Orc file and reads it back as a `DataFrame`,
* which is then passed to `f`. The Orc file will be deleted after `f` returns.
*/
protected def withOrcDataFrame[T <: Product: ClassTag: TypeTag]
(data: Seq[T], testVectorized: Boolean = true)
(f: DataFrame => Unit): Unit = withDataSourceDataFrame(data, testVectorized)(f)
protected def withOrcDataFrame(df: DataFrame, testVectorized: Boolean = true)
(f: DataFrame => Unit): Unit = {
withTempPath { file =>
df.write.format(dataSourceName).save(file.getCanonicalPath)
readFile(file.getCanonicalPath, testVectorized)(f)
}
}

/**
* Writes `data` to a Orc file, reads it back as a `DataFrame` and registers it as a
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.orc
import scala.collection.JavaConverters._

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{Attribute, Predicate}
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.parseColumnPath
import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation, PushableColumnAndNestedColumn}
import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation, NestedColumnPredicateTest, PushableColumnAndNestedColumn}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
import org.apache.spark.sql.functions._
@@ -62,7 +62,8 @@ import org.apache.spark.util.{AccumulatorContext, AccumulatorV2}
* dependent on this configuration, don't forget you better explicitly set this configuration
* within the test.
*/
abstract class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSparkSession {
abstract class ParquetFilterSuite
extends QueryTest with ParquetTest with NestedColumnPredicateTest with SharedSparkSession {

protected def createParquetFilters(
schema: MessageType,
@@ -105,44 +106,6 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(df)
}

/**
* Takes single level `inputDF` dataframe to generate multi-level nested
* dataframes as new test data.
*/
private def withNestedDataFrame(inputDF: DataFrame)
(runTest: (DataFrame, String, Any => Any) => Unit): Unit = {
assert(inputDF.schema.fields.length == 1)
assert(!inputDF.schema.fields.head.dataType.isInstanceOf[StructType])
val df = inputDF.toDF("temp")
Seq(
(
df.withColumnRenamed("temp", "a"),
"a", // zero nesting
(x: Any) => x),
(
df.withColumn("a", struct(df("temp") as "b")).drop("temp"),
"a.b", // one level nesting
(x: Any) => Row(x)),
(
df.withColumn("a", struct(struct(df("temp") as "c") as "b")).drop("temp"),
"a.b.c", // two level nesting
(x: Any) => Row(Row(x))
),
(
df.withColumnRenamed("temp", "a.b"),
"`a.b`", // zero nesting with column name containing `dots`
(x: Any) => x
),
(
df.withColumn("a.b", struct(df("temp") as "c.d") ).drop("temp"),
"`a.b`.`c.d`", // one level nesting with column names containing `dots`
(x: Any) => Row(x)
)
).foreach { case (df, colName, resultFun) =>
runTest(df, colName, resultFun)
}
}

private def testTimestampPushdown(data: Seq[String], java8Api: Boolean): Unit = {
implicit class StringToTs(s: String) {
def ts: Timestamp = Timestamp.valueOf(s)
@@ -27,7 +27,7 @@ import org.apache.orc.storage.serde2.io.HiveDecimalWritable

import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quoteIfNeeded
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types._

Expand Down Expand Up @@ -68,11 +68,9 @@ private[sql] object OrcFilters extends OrcFiltersBase {
* Create ORC filter as a SearchArgument instance.
*/
def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = {
val dataTypeMap = schema.map(f => quoteIfNeeded(f.name) -> f.dataType).toMap
val dataTypeMap = OrcFilters.getNameToOrcFieldMap(schema, SQLConf.get.caseSensitiveAnalysis)
// Combines all convertible filters using `And` to produce a single conjunction
// TODO (SPARK-25557): ORC doesn't support nested predicate pushdown, so they are removed.
val newFilters = filters.filter(!_.containsNestedColumn)
val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, newFilters))
val conjunctionOptional = buildTree(convertibleFilters(schema, dataTypeMap, filters))
conjunctionOptional.map { conjunction =>
// Then tries to build a single ORC `SearchArgument` for the conjunction predicate.
// The input predicate is fully convertible. There should not be any empty result in the
@@ -231,37 +229,37 @@ private[sql] object OrcFilters extends OrcFiltersBase {
// Since ORC 1.5.0 (ORC-323), we need to quote for column names with `.` characters
// in order to distinguish predicate pushdown for nested columns.
[Review comment — Member] Since we removed quoteIfNeeded in this file completely, I believe we can remove this old comment (231-232) together in both files, v1.2 (here) and v2.3.
expression match {
case EqualTo(name, value) if isSearchableType(dataTypeMap(name)) =>
case EqualTo(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startAnd().equals(name, getType(name), castedValue).end())

case EqualNullSafe(name, value) if isSearchableType(dataTypeMap(name)) =>
case EqualNullSafe(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startAnd().nullSafeEquals(name, getType(name), castedValue).end())

case LessThan(name, value) if isSearchableType(dataTypeMap(name)) =>
case LessThan(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startAnd().lessThan(name, getType(name), castedValue).end())

case LessThanOrEqual(name, value) if isSearchableType(dataTypeMap(name)) =>
case LessThanOrEqual(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startAnd().lessThanEquals(name, getType(name), castedValue).end())

case GreaterThan(name, value) if isSearchableType(dataTypeMap(name)) =>
case GreaterThan(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startNot().lessThanEquals(name, getType(name), castedValue).end())

case GreaterThanOrEqual(name, value) if isSearchableType(dataTypeMap(name)) =>
case GreaterThanOrEqual(name, value) if dataTypeMap.contains(name) =>
val castedValue = castLiteralValue(value, dataTypeMap(name))
Some(builder.startNot().lessThan(name, getType(name), castedValue).end())

case IsNull(name) if isSearchableType(dataTypeMap(name)) =>
case IsNull(name) if dataTypeMap.contains(name) =>
Some(builder.startAnd().isNull(name, getType(name)).end())

case IsNotNull(name) if isSearchableType(dataTypeMap(name)) =>
case IsNotNull(name) if dataTypeMap.contains(name) =>
Some(builder.startNot().isNull(name, getType(name)).end())

case In(name, values) if isSearchableType(dataTypeMap(name)) =>
case In(name, values) if dataTypeMap.contains(name) =>
val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(name)))
Some(builder.startAnd().in(name, getType(name),
castedValues.map(_.asInstanceOf[AnyRef]): _*).end())
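A sketch of the end result of these guards, as it would be exercised from a suite in the same package (OrcFilters is private[sql]); the schema and the printed SearchArgument form are illustrative:

```scala
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types._

// With the new name-to-type map, a filter on the nested field "a.b" is now
// considered convertible and yields a SearchArgument, where previously it was
// filtered out as a nested predicate.
val schema = StructType(Seq(
  StructField("a", StructType(Seq(StructField("b", IntegerType))))))

val sarg = OrcFilters.createFilter(schema, Seq(EqualTo("a.b", 1)))
// Expected to be defined, printing something like:
//   leaf-0 = (EQUALS a.b 1), expr = leaf-0
println(sarg)
```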