Skip to content

Commit

Permalink
apacheGH-44081: [C++][Parquet] Fix reported metrics in parquet-arrow-reader-writer-benchmark
Browse files Browse the repository at this point in the history

1. items/sec and bytes/sec were set to the same value in some benchmarks
2. bytes/sec was incorrectly computed for boolean columns
  • Loading branch information
pitrou committed Sep 12, 2024
1 parent 9986b7b commit 9076d07
Showing 1 changed file with 28 additions and 11 deletions.
39 changes: 28 additions & 11 deletions cpp/src/parquet/arrow/reader_writer_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <array>
#include <iostream>
#include <random>
#include <type_traits>

#include "parquet/arrow/reader.h"
#include "parquet/arrow/writer.h"
Expand All @@ -37,6 +38,7 @@
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/util/async_generator.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/logging.h"

Expand Down Expand Up @@ -104,13 +106,28 @@ std::shared_ptr<ColumnDescriptor> MakeSchema(Repetition::type repetition) {
repetition == Repetition::REPEATED);
}

template <bool nullable, typename ParquetType>
// Returns the number of bytes accounted for `num_items` values of
// `ParquetType`, computed as num_items * sizeof(ParquetType::c_type).
//
// Instantiating this generic version with FLBAType is a compile-time error
// (presumably because sizeof(c_type) does not give the width of a
// fixed-length byte array value -- confirm against the FLBAType definition).
template <typename ParquetType>
int64_t BytesForItems(int64_t num_items) {
  constexpr bool kIsFixedLenByteArray = std::is_same_v<ParquetType, FLBAType>;
  static_assert(!kIsFixedLenByteArray, "BytesForItems unsupported for FLBAType");

  using ValueType = typename ParquetType::c_type;
  return num_items * sizeof(ValueType);
}

// BooleanType specialization: the item count is handed to
// ::arrow::bit_util::BytesForBits as a number of bits, so each boolean value
// is accounted as one bit instead of sizeof(bool) bytes.
template <>
int64_t BytesForItems<BooleanType>(int64_t num_items) {
  const int64_t num_bits = num_items;
  return ::arrow::bit_util::BytesForBits(num_bits);
}

// Float16LogicalType specialization: every value is accounted as
// sizeof(uint16_t) bytes.
template <>
int64_t BytesForItems<Float16LogicalType>(int64_t num_items) {
  constexpr auto kHalfFloatWidth = sizeof(uint16_t);
  return num_items * kHalfFloatWidth;
}

// Reports the throughput counters of a benchmark run on `state`.
//
// state:      the benchmark state whose counters are updated.
// num_values: number of values handled per iteration (defaults to
//             BENCHMARK_SIZE).
//
// The item counter is iterations * num_values. The byte counter is derived
// from that item count through BytesForItems<ParquetType>, so the two
// counters are reported separately rather than both being set to a byte
// count computed from sizeof(ParquetType::c_type).
template <typename ParquetType>
void SetBytesProcessed(::benchmark::State& state, int64_t num_values = BENCHMARK_SIZE) {
  const int64_t items_processed = state.iterations() * num_values;

  state.SetItemsProcessed(items_processed);
  state.SetBytesProcessed(BytesForItems<ParquetType>(items_processed));
}

constexpr int64_t kAlternatingOrNa = -1;
Expand Down Expand Up @@ -188,7 +205,7 @@ static void BM_WriteColumn(::benchmark::State& state) {
EXIT_NOT_OK(
WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
}
SetBytesProcessed<nullable, ParquetType>(state);
SetBytesProcessed<ParquetType>(state);
}

BENCHMARK_TEMPLATE2(BM_WriteColumn, false, Int32Type);
Expand Down Expand Up @@ -332,7 +349,7 @@ static void BM_ReadColumn(::benchmark::State& state) {
auto properties = WriterProperties::Builder().disable_dictionary()->build();

BenchmarkReadTable(state, *table, properties, table->num_rows(),
sizeof(typename ParquetType::c_type) * table->num_rows());
BytesForItems<ParquetType>(table->num_rows()));
}

// There are two parameters here that cover different data distributions.
Expand Down Expand Up @@ -408,7 +425,7 @@ static void BM_ReadColumnPlain(::benchmark::State& state) {

auto properties = WriterProperties::Builder().disable_dictionary()->build();
BenchmarkReadTable(state, *table, properties, table->num_rows(),
sizeof(c_type) * table->num_rows());
BytesForItems<ParquetType>(table->num_rows()));
}

BENCHMARK_TEMPLATE2(BM_ReadColumnPlain, false, Int32Type)
Expand Down Expand Up @@ -664,7 +681,7 @@ static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
std::shared_ptr<::arrow::Table> final_table;
PARQUET_ASSIGN_OR_THROW(final_table, ConcatenateTables(tables));
}
SetBytesProcessed<true, Int64Type>(state);
SetBytesProcessed<Int64Type>(state);
}

BENCHMARK(BM_ReadIndividualRowGroups);
Expand All @@ -688,7 +705,7 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) {
std::shared_ptr<::arrow::Table> table;
EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table));
}
SetBytesProcessed<true, Int64Type>(state);
SetBytesProcessed<Int64Type>(state);
}

BENCHMARK(BM_ReadMultipleRowGroups);
Expand Down Expand Up @@ -716,7 +733,7 @@ static void BM_ReadMultipleRowGroupsGenerator(::benchmark::State& state) {
ASSIGN_OR_ABORT(auto batches, fut.result());
ASSIGN_OR_ABORT(auto actual, ::arrow::Table::FromRecordBatches(std::move(batches)));
}
SetBytesProcessed<true, Int64Type>(state);
SetBytesProcessed<Int64Type>(state);
}

BENCHMARK(BM_ReadMultipleRowGroupsGenerator);
Expand Down

0 comments on commit 9076d07

Please sign in to comment.