diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index b12f234f72bdf..2b85fe6c21317 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" @@ -37,6 +38,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/async_generator.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" @@ -104,13 +106,28 @@ std::shared_ptr MakeSchema(Repetition::type repetition) { repetition == Repetition::REPEATED); } -template +template +int64_t BytesForItems(int64_t num_items) { + static_assert(!std::is_same_v, + "BytesForItems unsupported for FLBAType"); + return num_items * sizeof(typename ParquetType::c_type); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return ::arrow::bit_util::BytesForBits(num_items); +} + +template <> +int64_t BytesForItems(int64_t num_items) { + return num_items * sizeof(uint16_t); +} + +template void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) { const int64_t items_processed = state.iterations() * num_values; - const int64_t bytes_processed = items_processed * sizeof(typename ParquetType::c_type); - - state.SetItemsProcessed(bytes_processed); - state.SetBytesProcessed(bytes_processed); + state.SetItemsProcessed(items_processed); + state.SetBytesProcessed(BytesForItems(items_processed)); } constexpr int64_t kAlternatingOrNa = -1; @@ -188,7 +205,7 @@ static void BM_WriteColumn(::benchmark::State& state) { EXIT_NOT_OK( WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type); @@ -332,7 +349,7 @@ static void BM_ReadColumn(::benchmark::State& state) { auto properties = WriterProperties::Builder().disable_dictionary()->build(); BenchmarkReadTable(state, *table, properties, table->num_rows(), - sizeof(typename ParquetType::c_type) * table->num_rows()); + BytesForItems(table->num_rows())); } // There are two parameters here that cover different data distributions. @@ -408,7 +425,7 @@ static void BM_ReadColumnPlain(::benchmark::State& state) { auto properties = WriterProperties::Builder().disable_dictionary()->build(); BenchmarkReadTable(state, *table, properties, table->num_rows(), - sizeof(c_type) * table->num_rows()); + BytesForItems(table->num_rows())); } BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type) @@ -664,7 +681,7 @@ static void BM_ReadIndividualRowGroups(::benchmark::State& state) { std::shared_ptr<::arrow::Table> final_table; PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadIndividualRowGroups); @@ -688,7 +705,7 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) { std::shared_ptr<::arrow::Table> table; EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table)); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadMultipleRowGroups); @@ -716,7 +733,7 @@ static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) { ASSIGN_OR_ABORT(auto batches, fut.result()); ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches))); } - SetBytesProcessed(state); + SetBytesProcessed(state); } BENCHMARK(BM_ReadMultipleRowGroupsGenerator);