
GH-32566: [C++] Connect parquet to the new scan node #35889

Closed
wants to merge 13 commits
77 changes: 77 additions & 0 deletions cpp/src/arrow/acero/sink_node_test.cc
@@ -0,0 +1,77 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include <chrono>
#include <memory>

#include "arrow/acero/exec_plan.h"
#include "arrow/acero/options.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/partition.h"
#include "arrow/filesystem/mockfs.h"
#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"

#include "arrow/table.h"
#include "arrow/util/key_value_metadata.h"

namespace arrow {

namespace acero {

TEST(SinkNode, CustomFieldMetadata) {
// Create an input table with a nullable and a non-nullable column
ExecBatch batch = gen::Gen({gen::Step()})->FailOnError()->ExecBatch(/*num_rows=*/1);
std::shared_ptr<Schema> test_schema =
schema({field("nullable_i32", uint32(), /*nullable=*/true,
key_value_metadata({{"foo", "bar"}})),
field("non_nullable_i32", uint32(), /*nullable=*/false)});
std::shared_ptr<RecordBatch> record_batch =
RecordBatch::Make(test_schema, /*num_rows=*/1,
{batch.values[0].make_array(), batch.values[0].make_array()});
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> table,
Table::FromRecordBatches({std::move(record_batch)}));

ASSERT_TRUE(table->field(0)->nullable());
ASSERT_EQ(1, table->field(0)->metadata()->keys().size());
ASSERT_FALSE(table->field(1)->nullable());
ASSERT_EQ(0, table->field(1)->metadata()->keys().size());

Declaration plan = Declaration::Sequence(
{{"table_source", TableSourceNodeOptions(std::move(table))},
{"project", ProjectNodeOptions({compute::field_ref(0), compute::field_ref(1)})}});

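// Running the plan should preserve both nullability and field-level metadata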
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> out_table, DeclarationToTable(plan));

ASSERT_TRUE(out_table->field(0)->nullable());
ASSERT_EQ(1, out_table->field(0)->metadata()->keys().size());
ASSERT_FALSE(out_table->field(1)->nullable());
ASSERT_EQ(0, out_table->field(1)->metadata()->keys().size());

ASSERT_OK_AND_ASSIGN(BatchesWithCommonSchema batches_and_schema,
DeclarationToExecBatches(plan));
ASSERT_TRUE(batches_and_schema.schema->field(0)->nullable());
ASSERT_FALSE(batches_and_schema.schema->field(1)->nullable());
}

} // namespace acero
} // namespace arrow
62 changes: 42 additions & 20 deletions cpp/src/arrow/dataset/dataset.cc
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <limits>
#include <memory>
#include <utility>

@@ -44,13 +45,31 @@ Fragment::Fragment(compute::Expression partition_expression,
physical_schema_(std::move(physical_schema)) {}

Future<std::shared_ptr<InspectedFragment>> Fragment::InspectFragment(
const FragmentScanOptions* format_options, compute::ExecContext* exec_context,
bool should_cache) {
util::Mutex::Guard lk = physical_schema_mutex_.Lock();
if (cached_inspected_fragment_) {
return cached_inspected_fragment_;
}
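// Note: the lock is deliberately released before the (potentially slow) inspection
// below. Two threads that both miss the cache may run the inspection twice, but the
// cached value itself is only read or written while the mutex is held.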
lk.Unlock();
return InspectFragmentImpl(format_options, exec_context)
.Then([this, should_cache](const std::shared_ptr<InspectedFragment>& frag) {
if (should_cache) {
util::Mutex::Guard lk = physical_schema_mutex_.Lock();
cached_inspected_fragment_ = frag;
}
return frag;
});
}

Future<std::shared_ptr<InspectedFragment>> Fragment::InspectFragmentImpl(
const FragmentScanOptions* format_options, compute::ExecContext* exec_context) {
return Status::NotImplemented("Inspect fragment");
}

Future<std::shared_ptr<FragmentScanner>> Fragment::BeginScan(
-const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
-const FragmentScanOptions* format_options, compute::ExecContext* exec_context) {
+const FragmentScanRequest& request, InspectedFragment* inspected_fragment,
+compute::ExecContext* exec_context) {
return Status::NotImplemented("New scan method");
}
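The split between InspectFragment and BeginScan is easiest to see from the caller's side. Below is a minimal sketch, not part of this diff, of how a caller might chain the two phases. StartScan and its parameters are hypothetical placeholders, and a real caller (such as the scan node) must also keep the inspected fragment alive until the scan completes:

Future<std::shared_ptr<FragmentScanner>> StartScan(
    std::shared_ptr<Fragment> fragment, FragmentScanRequest request,
    const FragmentScanOptions* format_options, compute::ExecContext* exec_context) {
  // Phase 1: cheap metadata inspection (may be answered from the fragment's cache)
  return fragment->InspectFragment(format_options, exec_context, /*should_cache=*/true)
      // Phase 2: begin the actual scan once the inspected metadata is available
      .Then([=](const std::shared_ptr<InspectedFragment>& inspected) {
        return fragment->BeginScan(request, inspected.get(), exec_context);
      });
}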

@@ -156,42 +175,45 @@ Future<std::optional<int64_t>> InMemoryFragment::CountRows(
return Future<std::optional<int64_t>>::MakeFinished(total);
}

-Future<std::shared_ptr<InspectedFragment>> InMemoryFragment::InspectFragment(
+Future<std::shared_ptr<InspectedFragment>> InMemoryFragment::InspectFragmentImpl(
const FragmentScanOptions* format_options, compute::ExecContext* exec_context) {
return std::make_shared<InspectedFragment>(physical_schema_->field_names());
}

class InMemoryFragment::Scanner : public FragmentScanner {
public:
-explicit Scanner(InMemoryFragment* fragment) : fragment_(fragment) {}
+explicit Scanner(std::vector<std::shared_ptr<RecordBatch>> batches)
+: batches_(std::move(batches)) {}

-Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) override {
-return Future<std::shared_ptr<RecordBatch>>::MakeFinished(
-fragment_->record_batches_[batch_number]);
+AsyncGenerator<std::shared_ptr<RecordBatch>> RunScanTask(int batch_number) override {
+DCHECK_EQ(batch_number, 0);
+return MakeVectorGenerator(std::move(batches_));
}

-int64_t EstimatedDataBytes(int batch_number) override {
-return arrow::util::TotalBufferSize(*fragment_->record_batches_[batch_number]);
-}
+int NumScanTasks() override { return 1; }

-int NumBatches() override {
-return static_cast<int>(fragment_->record_batches_.size());
+int NumBatchesInScanTask(int task_number) override {
+DCHECK_LE(batches_.size(),
+static_cast<uint64_t>(std::numeric_limits<int32_t>::max()));
+return static_cast<int>(batches_.size());
}

private:
-InMemoryFragment* fragment_;
+std::vector<std::shared_ptr<RecordBatch>> batches_;
};

Future<std::shared_ptr<FragmentScanner>> InMemoryFragment::BeginScan(
-const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
-const FragmentScanOptions* format_options, compute::ExecContext* exec_context) {
+const FragmentScanRequest& request, InspectedFragment* inspected_fragment,
+compute::ExecContext* exec_context) {
return Future<std::shared_ptr<FragmentScanner>>::MakeFinished(
-std::make_shared<InMemoryFragment::Scanner>(this));
+std::make_shared<InMemoryFragment::Scanner>(record_batches_));
}

-Dataset::Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression)
+Dataset::Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression,
+bool should_cache_metadata)
: schema_(std::move(schema)),
-partition_expression_(std::move(partition_expression)) {}
+partition_expression_(std::move(partition_expression)),
+should_cache_metadata_(should_cache_metadata) {}

Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan() {
return std::make_shared<ScannerBuilder>(this->shared_from_this());
@@ -246,7 +268,7 @@ struct VectorRecordBatchGenerator : InMemoryDataset::RecordBatchGenerator {

InMemoryDataset::InMemoryDataset(std::shared_ptr<Schema> schema,
RecordBatchVector batches)
-: Dataset(std::move(schema)),
+: Dataset(std::move(schema), /*should_cache_metadata=*/false),
get_batches_(new VectorRecordBatchGenerator(std::move(batches))) {}

struct TableRecordBatchGenerator : InMemoryDataset::RecordBatchGenerator {
@@ -263,7 +285,7 @@ struct TableRecordBatchGenerator : InMemoryDataset::RecordBatchGenerator {
};

InMemoryDataset::InMemoryDataset(std::shared_ptr<Table> table)
-: Dataset(table->schema()),
+: Dataset(table->schema(), /*should_cache_metadata=*/false),
get_batches_(new TableRecordBatchGenerator(std::move(table))) {}

Result<std::shared_ptr<Dataset>> InMemoryDataset::ReplaceSchema(
101 changes: 81 additions & 20 deletions cpp/src/arrow/dataset/dataset.h
@@ -109,25 +109,58 @@ struct ARROW_DS_EXPORT FragmentScanRequest {
const FragmentScanOptions* format_scan_options;
};

-/// \brief An iterator-like object that can yield batches created from a fragment
+/// \brief An abstraction over (potentially parallel) reading of a fragment
class ARROW_DS_EXPORT FragmentScanner {
public:
-/// This instance will only be destroyed after all ongoing scan futures
+/// This instance will only be destroyed after all ongoing scan tasks
/// have been completed.
///
/// This means any callbacks created as part of the scan can safely
/// capture `this`
virtual ~FragmentScanner() = default;
-/// \brief Scan a batch of data from the file
-/// \param batch_number The index of the batch to read
-virtual Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) = 0;
-/// \brief Calculate an estimate of how many data bytes the given batch will represent
-///
-/// "Data bytes" should be the total size of all the buffers once the data has been
-/// decoded into the Arrow format.
-virtual int64_t EstimatedDataBytes(int batch_number) = 0;
-/// \brief The number of batches in the fragment to scan
-virtual int NumBatches() = 0;
+/// \brief Run a task to scan batches of data from a file
+///
+/// Each scan task will generate a sequence of batches. If a file supports multiple
+/// scan tasks then the scan tasks should be able to run in parallel.
+///
+/// For example, the CSV scanner currently generates a single stream of batches from
+/// the start of the file to the end. It is not capable of reading batches in parallel
+/// and so there is a single scan task.
+///
+/// The parquet scanner can read from different row groups concurrently. Each row group
+/// generates a sequence of batches (row groups can be very large and we may not want
+/// to read the row group into memory all at once).
+///
+/// Multiple scan tasks will be launched in parallel. In other words, RunScanTask
+/// will be called async-reentrantly (it may be called again before the generator it
+/// returns is exhausted).
+///
+/// However, RunScanTask will not be called sync-reentrantly (it will not be
+/// called again while a call to this method is in progress) and it will be called
+/// in order.
+///
+/// For example, RunScanTask(5) will always be called after RunScanTask(4), yet the
+/// batches from scan task 5 may arrive before the batches from scan task 4 and this is
+/// ok. If the user desires ordered execution then batches will be sequenced later.
+///
+/// \param task_number The index of the scan task to execute
+virtual AsyncGenerator<std::shared_ptr<RecordBatch>> RunScanTask(int task_number) = 0;
+
+/// \brief The total number of scan tasks that will be run
+virtual int NumScanTasks() = 0;
+
+static constexpr int kUnknownNumberOfBatches = -1;
+/// \brief The total number of batches that will be delivered by a scan task
+///
+/// Ideally, this will be known in advance by inspecting the metadata. A fragment
+/// scanner may choose to emit empty batches in order to respect this value.
+///
+/// If it is not possible to know this in advance, then a fragment may return
+/// FragmentScanner::kUnknownNumberOfBatches. Note that doing so will have a
+/// significant negative effect on scan parallelism because a scan task will not start
+/// until we have determined how many batches precede it. This means that any scan
+/// tasks following this one will have to wait until this scan task is fully exhausted.
+virtual int NumBatchesInScanTask(int task_number) = 0;
};
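To make this contract concrete, here is a minimal sketch of an implementation. It is hypothetical (this PR's real examples are InMemoryFragment::Scanner and the parquet scanner), but it shows the parallel case by splitting pre-loaded batches across two scan tasks:

class TwoTaskScanner : public FragmentScanner {
 public:
  TwoTaskScanner(std::vector<std::shared_ptr<RecordBatch>> first,
                 std::vector<std::shared_ptr<RecordBatch>> second)
      : tasks_{std::move(first), std::move(second)} {}

  AsyncGenerator<std::shared_ptr<RecordBatch>> RunScanTask(int task_number) override {
    // Each task yields its own independent stream and the two streams may be
    // consumed concurrently. Moving the vector out is safe because a given
    // task number is run at most once.
    return MakeVectorGenerator(std::move(tasks_[task_number]));
  }

  int NumScanTasks() override { return 2; }

  int NumBatchesInScanTask(int task_number) override {
    // The count is known up front, so kUnknownNumberOfBatches is never needed
    return static_cast<int>(tasks_[task_number].size());
  }

 private:
  std::vector<std::shared_ptr<RecordBatch>> tasks_[2];
};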

/// \brief Information learned about a fragment through inspection
@@ -140,8 +173,11 @@ class ARROW_DS_EXPORT FragmentScanner {
/// names and use those column names to determine which columns to load
/// from the CSV file.
struct ARROW_DS_EXPORT InspectedFragment {
virtual ~InspectedFragment() = default;

explicit InspectedFragment(std::vector<std::string> column_names)
: column_names(std::move(column_names)) {}

std::vector<std::string> column_names;
};
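Since InspectedFragment now has a virtual destructor, a file format can derive from it to carry format-specific discoveries from the inspection phase into BeginScan. A sketch under that assumption; the derived type and its field are illustrative and not part of this PR:

struct InspectedCsvFragment : public InspectedFragment {
  InspectedCsvFragment(std::vector<std::string> column_names, int64_t header_bytes)
      : InspectedFragment(std::move(column_names)), header_bytes(header_bytes) {}

  // Hypothetical extra information discovered while probing the file: the size of
  // the header row, so the scan can seek straight past it
  int64_t header_bytes;
};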

@@ -175,12 +211,13 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
/// information will be needed to figure out an evolution strategy. This information
/// will then be passed to the call to BeginScan
virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
-const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
+const FragmentScanOptions* format_options, compute::ExecContext* exec_context,
+bool should_cache);

/// \brief Start a scan operation
virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
-const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
-const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
+const FragmentScanRequest& request, InspectedFragment* inspected_fragment,
+compute::ExecContext* exec_context);

/// \brief Count the number of rows in this fragment matching the filter using metadata
/// only. That is, this method may perform I/O, but will not load data.
@@ -206,11 +243,14 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
explicit Fragment(compute::Expression partition_expression,
std::shared_ptr<Schema> physical_schema);

virtual Future<std::shared_ptr<InspectedFragment>> InspectFragmentImpl(
const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;

util::Mutex physical_schema_mutex_;
compute::Expression partition_expression_ = compute::literal(true);
std::shared_ptr<Schema> physical_schema_;
std::shared_ptr<InspectedFragment> cached_inspected_fragment_;
};

/// \brief Per-scan options for fragment(s) in a dataset.
@@ -248,12 +288,11 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
compute::Expression predicate,
const std::shared_ptr<ScanOptions>& options) override;

-Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+Future<std::shared_ptr<InspectedFragment>> InspectFragmentImpl(
const FragmentScanOptions* format_options,
compute::ExecContext* exec_context) override;
Future<std::shared_ptr<FragmentScanner>> BeginScan(
-const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
-const FragmentScanOptions* format_options,
+const FragmentScanRequest& request, InspectedFragment* inspected_fragment,
compute::ExecContext* exec_context) override;

std::string type_name() const override { return "in-memory"; }
@@ -348,6 +387,19 @@ MakeBasicDatasetEvolutionStrategy();
/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
/// directory. A Dataset has a schema to which Fragments must align during a
/// scan operation. This is analogous to Avro's reader and writer schema.
///
/// It is assumed that a dataset will always generate fragments in the same
/// order. Data in a dataset thus has an "implicit order" which is first
/// decided by the fragment index and then the row index in a fragment. For
/// example, row 1 in fragment 10 comes after the last row in fragment 9.
///
/// A dataset will cache metadata by default. This will enable future scans
/// to be faster since they can skip some of the initial read steps. However,
/// if the dataset has many files, or if the file metadata itself is large, this
/// cached metadata could occupy a large amount of RAM.
///
/// Metadata should not be cached if the contents of the files are expected
/// to change between scans.
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
@@ -385,9 +437,15 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
virtual ~Dataset() = default;

protected:
-explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
+/// \brief Create a new dataset
+/// \param schema the dataset schema. This is the unified schema across all fragments
+/// \param should_cache_metadata if true then this dataset instance should try to cache
+/// metadata information during a scan.
+explicit Dataset(std::shared_ptr<Schema> schema, bool should_cache_metadata = true)
+: schema_(std::move(schema)), should_cache_metadata_(should_cache_metadata) {}

-Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
+Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression,
+bool should_cache_metadata = true);

virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
/// \brief Default non-virtual implementation method for the base
Expand All @@ -405,6 +463,8 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {

std::shared_ptr<Schema> schema_;
compute::Expression partition_expression_ = compute::literal(true);
bool should_cache_metadata_;

std::unique_ptr<DatasetEvolutionStrategy> evolution_strategy_ =
MakeBasicDatasetEvolutionStrategy();
};
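As a usage note, a dataset whose files are rewritten between scans should opt out of caching via the new constructor argument. A minimal sketch; the subclass is hypothetical and the pure virtual members are stubbed with NotImplemented for brevity (any further required overrides are elided):

class VolatileDataset : public Dataset {
 public:
  explicit VolatileDataset(std::shared_ptr<Schema> schema)
      // Cached metadata would go stale when the underlying files change
      : Dataset(std::move(schema), /*should_cache_metadata=*/false) {}

  std::string type_name() const override { return "volatile"; }

  Result<std::shared_ptr<Dataset>> ReplaceSchema(
      std::shared_ptr<Schema> schema) const override {
    return Status::NotImplemented("ReplaceSchema");
  }

 protected:
  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override {
    return Status::NotImplemented("GetFragmentsImpl");
  }
};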
@@ -427,7 +487,8 @@ class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
/// Construct a dataset from a schema and a factory of record batch iterators.
InMemoryDataset(std::shared_ptr<Schema> schema,
std::shared_ptr<RecordBatchGenerator> get_batches)
-: Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
+: Dataset(std::move(schema), /*should_cache_metadata=*/false),
+get_batches_(std::move(get_batches)) {}

/// Convenience constructor taking a fixed list of batches
InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);