From 04c443cc039814d0e1f2ab2ec636ad00256d63fc Mon Sep 17 00:00:00 2001
From: lazyman <lazyman500@gmail.com>
Date: Tue, 17 Mar 2015 10:56:53 +0800
Subject: [PATCH] SPARK-5068: fix bug when partition path doesn't exists #2

---
 .../apache/spark/sql/hive/TableReader.scala   | 34 +++++++++-
 .../spark/sql/hive/QueryPartitionSuite.scala  | 64 +++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index f22c9eaeedc7d..6963e7b6bc6d9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -142,7 +142,39 @@ class HadoopTableReader(
       partitionToDeserializer: Map[HivePartition,
       Class[_ <: Deserializer]],
       filterOpt: Option[PathFilter]): RDD[Row] = {
-    val hivePartitionRDDs = partitionToDeserializer.map { case (partition, partDeserializer) =>
+    // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists
+
+      var existPathSet =collection.mutable.Set[String]()
+      var pathPatternSet = collection.mutable.Set[String]()
+
+     val hivePartitionRDDs = partitionToDeserializer.filter {
+            case (partition, partDeserializer) =>
+
+            def updateExistPathSetByPathPattern(pathPatternStr:String ){
+              val pathPattern = new Path(pathPatternStr)
+              val fs = pathPattern.getFileSystem(sc.hiveconf)
+              val matchs = fs.globStatus(pathPattern);
+              matchs.map( fileStatus =>(existPathSet+= fileStatus.getPath.toString))
+            }
+            // convert  /demo/data/year/month/day  to  /demo/data/**/**/**/
+            def getPathPatternByPath(parNum:Int,tpath:Path):String = {
+              var path  = tpath
+              for (i <- (1 to parNum)) { path = path.getParent }
+              val tails = (1 to parNum).map(_ => "*").mkString("/","/","/")
+              path.toString + tails
+            }
+
+            val partPath = HiveShim.getDataLocationPath(partition)
+            val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size();
+            var pathPatternStr = getPathPatternByPath(partNum,partPath)
+            if(!pathPatternSet.contains(pathPatternStr)){
+              pathPatternSet+=pathPatternStr
+              updateExistPathSetByPathPattern(pathPatternStr)
+            }
+              existPathSet.contains(partPath.toString)
+
+       }
+      .map { case (partition, partDeserializer) =>
       val partDesc = Utilities.getPartitionDesc(partition)
       val partPath = HiveShim.getDataLocationPath(partition)
       val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
new file mode 100644
index 0000000000000..8b90bfb19fde8
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.File
+
+import com.google.common.io.Files
+import org.apache.spark.sql.{QueryTest, _}
+import org.apache.spark.sql.hive.test.TestHive
+import org.apache.spark.util.Utils
+/* Implicits */
+import org.apache.spark.sql.hive.test.TestHive._
+
+
+
+class QueryPartitionSuite extends QueryTest {
+  import org.apache.spark.sql.hive.test.TestHive.implicits._
+
+  test("SPARK-5068: query data when path doesn't exists"){
+    val testData = TestHive.sparkContext.parallelize(
+      (1 to 10).map(i => TestData(i, i.toString))).toDF()
+    testData.registerTempTable("testData")
+
+    val tmpDir = Files.createTempDir()
+    //create the table for test
+    sql(s"CREATE TABLE table_with_partition(key int,value string) PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
+    sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='1') SELECT key,value FROM testData")
+    sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='2') SELECT key,value FROM testData")
+    sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='3') SELECT key,value FROM testData")
+    sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='4') SELECT key,value FROM testData")
+
+    //test for the exist path
+    checkAnswer(sql("select key,value from table_with_partition"),
+      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
+        ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)
+
+    //delect the path of one partition
+    val folders = tmpDir.listFiles.filter(_.isDirectory)
+    Utils.deleteRecursively(folders(0))
+
+    //test for affter delete the path
+    checkAnswer(sql("select key,value from table_with_partition"),
+      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
+        ++ testData.toSchemaRDD.collect)
+
+    sql("DROP TABLE table_with_partition")
+    sql("DROP TABLE createAndInsertTest")
+  }
+}
\ No newline at end of file