[SPARK-32381][CORE][SQL] Move and refactor parallel listing & non-location sensitive listing to core #29471

Status: Closed. Wants to merge 30 commits. The diff below reflects changes from 20 of the 30 commits.

Commits (30):
2d6add1  Start working on re-implementing the resolution logic to avoid gettin… (holdenk, Jul 2, 2020)
02ea25d  Revert "Start working on re-implementing the resolution logic to avoi… (holdenk, Jul 2, 2020)
fded394  Start moving InMemoryFileIndex over to HadoopFSUtils (holdenk, Jul 2, 2020)
f2fdcd7  Get the SQL layer compiling against the common shim layer (holdenk, Jul 6, 2020)
37253da  Keep working on plumbing through the type info we need to avoid direc… (holdenk, Jul 6, 2020)
e6eee1d  Ok core compiles now (holdenk, Jul 8, 2020)
0492bee  Get the input filter based on the jobContext (holdenk, Jul 8, 2020)
138e14a  Backout some small changes we don't need anymore (holdenk, Jul 8, 2020)
dace630  Revert the class change to BinaryFileRDD we don't depend on that anym… (holdenk, Jul 14, 2020)
20586d3  Fix dropped annotation (holdenk, Jul 14, 2020)
7bb0770  Back out NewHadoopRDD changes, we'll expose a trait instead and have … (holdenk, Jul 29, 2020)
4eb770a  Rework the HadoopFSUtils to: use the serilizablility of block locatio… (holdenk, Jul 29, 2020)
8a5fd8b  Fix the bug and remove default params so it's more difficult to write (holdenk, Jul 29, 2020)
d85c5a4  Remove un-used changes to NewHadoopRDD (holdenk, Aug 3, 2020)
3b0bf18  Put the utility function in HadoopFSUtils to make it easier. (holdenk, Aug 4, 2020)
8b7a2fa  Fix NPE (sunchao, Aug 19, 2020)
e0ec9a6  Defaults ignoreLocality to false and small cleanups (sunchao, Aug 19, 2020)
b1bf5e9  Don't use null as BlockLocations (sunchao, Aug 20, 2020)
5529047  Revert changes on WholeTextInputFileFormat (sunchao, Aug 20, 2020)
6f5c7e5  Remove unused import (sunchao, Aug 20, 2020)
43fc5a0  Address comments and limit this to only refactoring (sunchao, Aug 21, 2020)
1f9bbbc  Remove Hive metrics call and nits (sunchao, Aug 21, 2020)
0ab104d  Make checkstyle happy (sunchao, Aug 21, 2020)
2186a66  Fix checkstyle issue (for real) and change to Private API (sunchao, Aug 21, 2020)
549f335  Fix unidoc (sunchao, Aug 22, 2020)
2b1aacd  Bring back parallel listing metrics (sunchao, Aug 22, 2020)
f5e9581  Fix style issue again (sigh) (sunchao, Aug 22, 2020)
86c2013  Switch to private[spark] (sunchao, Aug 28, 2020)
bfa37cc  Address comments (sunchao, Aug 30, 2020)
2d8e64d  Remove redundant .toSeq (sunchao, Sep 14, 2020)

34 changes: 34 additions & 0 deletions core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -443,6 +443,40 @@ package object config {
.booleanConf
.createWithDefault(false)

val PARALLEL_PARTITION_DISCOVERY_THRESHOLD =
ConfigBuilder("spark.rdd.sources.parallelPartitionDiscovery.threshold")
.doc("The maximum number of paths allowed for listing files at driver side. If the number " +
"of detected paths exceeds this value during partition discovery, it tries to list the " +
"files with another Spark distributed job. This configuration is effective only when " +
"using file-based sources via HadoopFSUtils.")
.version("1.5.0")
.intConf
.checkValue(parallel => parallel >= 0, "The maximum number of paths allowed for listing " +
"files at driver side must not be negative")
.createWithDefault(2)

val PARALLEL_PARTITION_DISCOVERY_PARALLELISM =
ConfigBuilder("spark.rdd.sources.parallelPartitionDiscovery.parallelism")
.doc("The number of parallelism to list a collection of path recursively, Set the " +
"number to prevent file listing from generating too many tasks.")
.version("3.1.0")
.internal()
.intConf
.createWithDefault(10000)

val IGNORE_DATA_LOCALITY =
ConfigBuilder("spark.rdd.sources.ignoreDataLocality")
.doc("If true, Spark will not fetch the block locations for each file on " +
"listing files. This speeds up file listing, but the scheduler cannot " +
"schedule tasks to take advantage of data locality. It can be particularly " +
"useful if data is read from a remote cluster so the scheduler could never " +
"take advantage of locality anyway. This configuration is effective only when " +
"using file-based using HadoopFSUtils")
.version("3.1.0")
.internal()
.booleanConf
.createWithDefault(false)

private[spark] val STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK =
ConfigBuilder("spark.storage.decommission.maxReplicationFailuresPerBlock")
.internal()
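The three configurations added above control when HadoopFSUtils switches from driver-side to distributed listing and whether block locations are fetched at all. As a minimal sketch (the application name and the chosen values are illustrative, not defaults from this PR), they could be set like this when constructing a SparkContext:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative values only: the keys match the ConfigBuilder entries above,
// but the numbers and the app name are hypothetical.
val conf = new SparkConf()
  .setAppName("hadoop-fs-utils-listing-example")
  // Switch to a distributed listing job once more than 32 paths are discovered.
  .set("spark.rdd.sources.parallelPartitionDiscovery.threshold", "32")
  // Cap the number of tasks used by the distributed listing job.
  .set("spark.rdd.sources.parallelPartitionDiscovery.parallelism", "100")
  // Skip fetching block locations while listing (e.g. when reading from a remote cluster).
  .set("spark.rdd.sources.ignoreDataLocality", "true")

val sc = new SparkContext(conf)
```

Note that the latter two entries are marked .internal() in the diff, so they are not part of the documented user-facing configuration surface.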
259 changes: 259 additions & 0 deletions core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala
@@ -0,0 +1,259 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.util

import java.io.FileNotFoundException

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.hadoop.fs.viewfs.ViewFileSystem
import org.apache.hadoop.hdfs.DistributedFileSystem

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.source.HiveCatalogMetrics

/**
* :: DeveloperApi ::
* Utility functions to simplify and speed-up file listing.
*/
@DeveloperApi
object HadoopFSUtils extends Logging {
/**
* :: DeveloperApi ::
* Lists a collection of paths recursively. Picks the listing strategy adaptively depending
* on the number of paths to list.
*
* This may only be called on the driver.
*
* @return for each input path, the set of discovered files for the path
*/
@DeveloperApi
def parallelListLeafFiles(sc: SparkContext, paths: Seq[Path], hadoopConf: Configuration,
Member:
If you intend to expose this to Java users as well, we shouldn't use Scala-friendly collections like Seq or Option.

Member:
Since we need to pass SparkContext into this method, why not add it to SparkContext and return an RDD instead? Then the user could use RDD APIs to do other work in parallel, for example adding a filter to avoid pulling all paths to the driver.

Member Author:
Thanks. I'll address this in a separate PR.

I'm not sure about adding this to the RDD API though: what this method does is very specific, such as handling FileNotFoundException, dealing with locality, and thresholding on whether to enable or disable parallelism. It is composed of basic API calls like parallelize and mapPartitions, and I'm not sure which part of it is general enough to be extracted as a separate RDD API.

filter: PathFilter, areSQLRootPaths: Boolean, ignoreMissingFiles: Boolean,
ignoreLocality: Boolean, maxParallelism: Int,
filterFun: Option[String => Boolean] = None): Seq[(Path, Seq[FileStatus])] = {
Member:
Following the comment above (https://github.com/apache/spark/pull/29471/files#r474382279), I am not sure String => Boolean is Java-friendly. If this is not meant to be exposed to Java users, it seems okay.

Member Author:
Thanks. I'll limit the current PR to refactoring and address this in a later PR.

HiveCatalogMetrics.incrementParallelListingJobCount(1)
Member:
Putting HiveCatalogMetrics here looks strange. This is "Metrics for access to the hive external catalog". Should we skip this or create another metric?

Member Author:
Good catch. It seems inappropriate here. I'm not sure what the best way is to plug Hive/SQL metrics in here, but I'll think it over later.

Contributor:
We could pass in a callback for listing metrics?

Member:
Or call HiveCatalogMetrics.incrementParallelListingJobCount(1) on the SQL side before calling parallelListLeafFiles?

Contributor:
The problem with that is that if we have a root directory with many sub-directories in it, we may initially choose to do non-parallel listing and then, as the sub-directories build up, switch to parallel listing.

Member Author:
Yeah, moving this to SQL will not work. I think we can perhaps add a callback later, just as @holdenk suggested (if you don't mind leaving it here in this PR).

On the other hand, I think HiveCatalogMetrics is already misleading: InMemoryFileIndex is used by non-Hive data sources such as file-based ones, and this particular metric has nothing to do with the Hive catalog. Perhaps a better approach is to move the metrics to some place in core that covers storage-specific things. In the future we can include more, such as total listing time.

Member:
A callback sounds good. We can do it later.

Contributor:
Sounds good to me. @sunchao, want to file a JIRA for switching this to a callback?

Member Author:
Sure, filed https://issues.apache.org/jira/browse/SPARK-32880 for the follow-ups.
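Following the callback idea discussed above (tracked in SPARK-32880), a minimal sketch of what such a hook might look like is shown below. The trait and object names are hypothetical; nothing like this exists in the PR as written.

```scala
// Hypothetical follow-up: a listing-metrics hook so that core code does not
// depend on HiveCatalogMetrics directly. All names here are illustrative.
trait ListingMetricsCallback {
  def onParallelListingJobStarted(numJobs: Int): Unit
}

// A no-op implementation for core callers that do not track listing metrics.
object NoopListingMetrics extends ListingMetricsCallback {
  override def onParallelListingJobStarted(numJobs: Int): Unit = ()
}

// The SQL layer could implement the trait by forwarding to
// HiveCatalogMetrics.incrementParallelListingJobCount(numJobs), and
// parallelListLeafFiles would invoke the callback in place of the direct
// metrics call above.
```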


val serializableConfiguration = new SerializableConfiguration(hadoopConf)
val serializedPaths = paths.map(_.toString)

// Cap the parallelism to prevent the following file listing from generating too many tasks
// in case of a large #defaultParallelism.
val numParallelism = Math.min(paths.size, maxParallelism)

val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
val statusMap = try {
val description = paths.size match {
case 0 =>
"Listing leaf files and directories for 0 paths"
case 1 =>
s"Listing leaf files and directories for 1 path:<br/>${paths(0)}"
case s =>
s"Listing leaf files and directories for $s paths:<br/>${paths(0)}, ..."
}
sc.setJobDescription(description) // TODO(holden): should we use jobgroup?
sc
.parallelize(serializedPaths, numParallelism)
.mapPartitions { pathStrings =>
val hadoopConf = serializableConfiguration.value
pathStrings.map(new Path(_)).toSeq.map { path =>
val leafFiles = listLeafFiles(
contextOpt = None, // Can't execute parallel scans on workers
path = path,
hadoopConf = hadoopConf,
filter = filter,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isSQLRootPath = areSQLRootPaths,
Member:
Shall we use a consistent name here?

Member Author:
This is copied from the current implementation in InMemoryFileIndex. I can resolve this in a follow-up PR if you like (as this PR is limited to refactoring).

Contributor:
The is/are difference is because listLeafFiles takes a single path while this method takes a list of paths.

filterFun = filterFun,
parallelismThreshold = Int.MaxValue,
maxParallelism = 0)
(path, leafFiles)
}.iterator
}.collect() // TODO(holden): should we use local itr here?
} finally {
sc.setJobDescription(previousJobDescription)
}

statusMap.toSeq
}
/**
* :: DeveloperApi ::
* Lists a single filesystem path recursively. If a SparkContext object is specified, this
* function may launch Spark jobs to parallelize listing based on parallelismThreshold.
*
* If contextOpt is None, this may be called on executors.
*
* @return all children of path that match the specified filter.
*/
// scalastyle:off argcount
@DeveloperApi
def listLeafFiles(
path: Path,
hadoopConf: Configuration,
filter: PathFilter,
contextOpt: Option[SparkContext],
ignoreMissingFiles: Boolean,
ignoreLocality: Boolean,
isSQLRootPath: Boolean,
filterFun: Option[String => Boolean],
parallelismThreshold: Int,
Member:
This seems to be a new parameter that did not exist before. Why do we need it? If people want parallelized listing, they can invoke parallelListLeafFiles above.

Member Author:
This is because listLeafFiles also recursively calls parallelListLeafFiles, so we need a way to pass down the arguments. These used to be read from the conf in SparkSession but are now made explicit as parameters, since we no longer have a session object.

maxParallelism: Int): Seq[FileStatus] = {
// scalastyle:on argcount

logTrace(s"Listing $path")
val fs = path.getFileSystem(hadoopConf)

// Note that statuses only include FileStatus for the files and dirs directly under path,
// and do not include anything else recursively.
val statuses: Array[FileStatus] = try {
fs match {
// DistributedFileSystem overrides listLocatedStatus to make 1 single call to namenode
// to retrieve the file status with the file block location. The reason to still fall back
// to listStatus is that the default implementation would potentially throw a
Contributor:
I'd question that. It's much better to use the incremental listLocatedStatus for better performance on paged object-store listings, especially if you can do useful work while it takes place.

Member Author:
I think we should add S3AFileSystem (in a follow-up) and perhaps a few others here, since they also support pagination. The fallback part of the comment confuses me, since I don't really see a fallback here for DistributedFileSystem etc.

Member Author (@sunchao, Aug 22, 2020):
Also, do you think we can add something to CommonPathCapabilities for this, so that we don't have to enumerate all these here?

Contributor:
I would just switch to the incremental one everywhere.

HDFS likes it because when you have many, many files in a path it can release the lock on the NN; object stores always have to page in (S3, Azure, GCS), which gives the implementors an opportunity to move from the default implementation to exposing the incremental one. Add this and we can just let the relevant teams know.

w.r.t. CommonPathCapabilities, I suppose we could add one which declares that the listing ops are paged. But do you really want code to try to be that clever? I'm trying to use that feature to mark up:

  • where optional features will fail with UnimplementedException or similar without you having to try using them (append, truncate, xattrs)
  • where there are fundamental semantics it's important for algorithms to worry about (hflush)
  • where your store wants to expose some state to callers (e.g. s3a exposing its HADOOP-13230 dir marker policy)

If you want a performance option, we could add one.

BTW, PathCapabilities is in hadoop-3.2.x now and will be in the next release. I might do it for 3.1 too; it makes a good way to programmatically/CLI probe for the s3a dir marker policy.

Also, cloudstore has some CLI commands for the list calls (and pathcapabilities) to help explore what's going on, including, on s3a, listing cost in number of HTTP requests. Worth playing with to see what is good/bad, though as Mukund has been doing lots of work on the 3.3.x s3a list, it will look worse than you'd expect on a dir treewalk.
https://github.com/steveloughran/cloudstore

Contributor:
I think changing this in a follow-up PR sounds fine to me. I'd like us to use the faster method by default and fall back on exception, but that could be a follow-on. Want to file a JIRA for broadening the scope of using the faster method?

Contributor:
w.r.t. the faster (incremental) calls, yes, something to consider next. At the very least, you will be able to collect and report/aggregate stats. For example, here is me getting the stats on a large/deep list just by calling toString on the RemoteIterator after it's done its work:

Listing statistics:
  counters=((object_list_request=1) (object_continue_list_request=40)); gauges=(); minimums=((object_list_request.min=1298) (object_continue_list_request.min=414)); maximums=((object_continue_list_request.max=746) (object_list_request.max=1298)); means=((object_continue_list_request.mean=(sum=18210, samples=39, mean=466.9231)) (object_list_request.mean=(sum=1298, samples=1, mean=1298.0000)));

2020-08-27 13:25:21,122 [main] INFO  tools.MarkerTool (DurationInfo.java:close(98)) - marker scan s3a://landsat-pds/: duration 0:19.563s
Listed 40000 objects under s3a://landsat-pds/

Now, who wouldn't like to know things like that? And ideally, collect across threads and merge back in.

As an aside, I looked at org.apache.hadoop.mapred.LocatedFileStatusFetcher. This does multithreaded status fetching and collects those stats. Although it's tagged Private, I've noticed Parquet uses it, so my next PR will convert it to public/evolving and document that fact.

If you could use that, we could look at evolving it better, especially returning a RemoteIterator of results which we could incrementally fill in across threads rather than block for the final results. Anything which makes it possible for app code to process data (read footers, etc.) while the listing goes on has significant benefit in the world of object stores.

Member Author:
Yes, stats will be nice. Looking forward to that 👍 (is it tracked by this PR?)

w.r.t. LocatedFileStatusFetcher, yes, it will be great if this is exposed as a public API. I think we should also consider moving it to another module such as hadoop-common, and perhaps adding an option to turn locality on/off (so that we don't have to get the block locations if they are not needed). Pairing it with batch listing can also potentially help (although that is only available in HDFS currently).

Instead of multiple threads, Spark currently uses a distributed job, so I'm not sure whether there would be any regression by doing that, but we can explore this later.

Contributor:
I'd worry about the effect of moving the package as is. It'd have to be a copy and paste, or move and subclass.

Contributor:
@sunchao, yes, that's the stats API. Now is the time to review it and tell me where it doesn't suit your (perceived future) needs. I've been playing with a Parquet branch which uses it, not in Spark itself.

Member Author:
Thanks for the info @steveloughran. I'll check out the PR (it is a very big one though).

// FileNotFoundException which is better handled by doing the lookups manually below.
case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality =>
val remoteIter = fs.listLocatedStatus(path)
new Iterator[LocatedFileStatus]() {
def next(): LocatedFileStatus = remoteIter.next
def hasNext(): Boolean = remoteIter.hasNext
}.toArray
case _ => fs.listStatus(path)
}
} catch {
// If we are listing a root path for SQL (e.g. a top level directory of a table), we need to
// ignore FileNotFoundExceptions during this root level of the listing because
//
// (a) certain code paths might construct an InMemoryFileIndex with root paths that
// might not exist (i.e. not all callers are guaranteed to have checked
// path existence prior to constructing InMemoryFileIndex) and,
// (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break
// existing behavior and break the ability to drop SessionCatalog tables when tables'
// root directories have been deleted (which breaks a number of Spark's own tests).
//
// If we are NOT listing a root path then a FileNotFoundException here means that the
// directory was present in a previous level of file listing but is absent in this
// listing, likely indicating a race condition (e.g. concurrent table overwrite or S3
// list inconsistency).
//
// The trade-off in supporting existing behaviors / use-cases is that we won't be
// able to detect race conditions involving root paths being deleted during
// InMemoryFileIndex construction. However, it's still a net improvement to detect and
// fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion.
case _: FileNotFoundException if isSQLRootPath || ignoreMissingFiles =>
logWarning(s"The directory $path was not found. Was it deleted very recently?")
Array.empty[FileStatus]
}

def doFilter(statuses: Array[FileStatus]) = filterFun match {
case Some(shouldFilterOut) =>
statuses.filterNot(status => shouldFilterOut(status.getPath.getName))
case None =>
statuses
}

val filteredStatuses = doFilter(statuses)
val allLeafStatuses = {
val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
val nestedFiles: Seq[FileStatus] = contextOpt match {
case Some(context) if dirs.size > parallelismThreshold =>
parallelListLeafFiles(
context,
dirs.map(_.getPath),
hadoopConf = hadoopConf,
filter = filter,
areSQLRootPaths = false,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
filterFun = filterFun,
maxParallelism = maxParallelism
).flatMap(_._2)
case _ =>
dirs.flatMap { dir =>
listLeafFiles(
path = dir.getPath,
hadoopConf = hadoopConf,
filter = filter,
contextOpt = contextOpt,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isSQLRootPath = false,
filterFun = filterFun,
parallelismThreshold = parallelismThreshold,
maxParallelism = maxParallelism)
}
}
val allFiles = topLevelFiles ++ nestedFiles
if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
}

val missingFiles = mutable.ArrayBuffer.empty[String]
val filteredLeafStatuses = doFilter(allLeafStatuses)
val resolvedLeafStatuses = filteredLeafStatuses.flatMap {
case f: LocatedFileStatus =>
Some(f)

// NOTE:
//
// - Although S3/S3A/S3N file system can be quite slow for remote file metadata
// operations, calling `getFileBlockLocations` does no harm here since these file system
// implementations don't actually issue RPC for this method.
//
// - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
// be a big deal since we always use `parallelListLeafFiles` when the number of
// paths exceeds threshold.
case f if !ignoreLocality =>
// The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
// which is very slow on some file systems (RawLocalFileSystem, which launches a
// subprocess and parses the stdout).
try {
val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
// Store BlockLocation objects to consume less memory
if (loc.getClass == classOf[BlockLocation]) {
loc
} else {
new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
}
}
val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
if (f.isSymlink) {
lfs.setSymlink(f.getSymlink)
}
Some(lfs)
} catch {
case _: FileNotFoundException if ignoreMissingFiles =>
missingFiles += f.getPath.toString
None
}

case f => Some(f)
}

if (missingFiles.nonEmpty) {
logWarning(
s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}")
}

resolvedLeafStatuses.toSeq
}
}
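
To make the intended use of the refactored API concrete, here is a minimal driver-side sketch of calling HadoopFSUtils.parallelListLeafFiles with the signature shown in this diff. The input path, filter, and parameter values are illustrative, and a later commit in this PR (86c2013) switches the object to private[spark], so outside Spark's own packages this is a sketch rather than a supported call.

```scala
import org.apache.hadoop.fs.{FileStatus, Path, PathFilter}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.HadoopFSUtils

object ListingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("hadoop-fs-utils-example"))
    val hadoopConf = sc.hadoopConfiguration

    // Accept every path; a real caller would typically filter out hidden files, etc.
    val acceptAll = new PathFilter {
      override def accept(path: Path): Boolean = true
    }

    // The table root below is hypothetical; parameter values are illustrative.
    val listed: Seq[(Path, Seq[FileStatus])] =
      HadoopFSUtils.parallelListLeafFiles(
        sc,
        Seq(new Path("hdfs://namenode/warehouse/my_table")),
        hadoopConf,
        filter = acceptAll,
        areSQLRootPaths = true,      // the paths are table root directories
        ignoreMissingFiles = false,
        ignoreLocality = false,
        maxParallelism = 100)        // filterFun is left at its default (None)

    listed.foreach { case (root, files) =>
      println(s"$root: ${files.size} leaf files")
    }
    sc.stop()
  }
}
```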