refactor(bb): simplify parallel_for_if_effective (#8079)
The chunkiness of `run_loop_in_parallel_if_effective` was getting in the
way of doing polynomial refactor passes for structured polynomials.

- Renamed the parallel-for variants so they all start with `parallel_for`.
- Computing the heuristic cost on the spot with constants is simpler; made a few
other small simplifications along the way.
- Removed the variant whose lambda takes two parameters and introduced a
one-parameter variant, since a lambda taking just an index `i` felt like the
sweet spot. Many call sites don't use the thread/chunk index yet, but I plan to
use it in a followup, since it lets us avoid mutexes and be more efficient.
A rough usage sketch of the new API follows below.

Bundled changes:
- Removed an unnecessary polynomial copy in IPA.
- Introduced a `SlabVector` alias where we were using `ContainerSlabAllocator`
directly, with plans to use it more (and possibly make it equal to a normal
`std::vector` for native code, where memory fragmentation really doesn't matter).
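
A rough sketch of the new call shapes, based on the hunks below. The chunked
`(start, end, chunk_index)` overload appears in thread.cpp; the one-parameter
overload is only visible at its call sites in ipa.hpp, so its exact declaration
(and the `bb::fr` workload used here) are assumptions for illustration:

```cpp
#include "barretenberg/common/compiler_hints.hpp" // BB_UNUSED
#include "barretenberg/common/thread.hpp"
#include <cstddef>
#include <vector>

void double_and_square(std::vector<bb::fr>& out, const std::vector<bb::fr>& in)
{
    // Chunked variant: each worker gets a [start, end) range plus its chunk index,
    // and the per-iteration heuristic cost decides whether threads are spawned at all.
    parallel_for_heuristic(
        in.size(),
        [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
            for (size_t i = start; i < end; i++) {
                out[i] = in[i] + in[i];
            }
        },
        thread_heuristics::FF_ADDITION_COST);

    // One-parameter variant: the lambda receives only the iteration index.
    parallel_for_heuristic(
        in.size(), [&](size_t i) { out[i] = in[i] * in[i]; }, thread_heuristics::FF_MULTIPLICATION_COST);
}
```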
ludamad authored Aug 20, 2024
1 parent f668907 commit 5bff26b
Showing 20 changed files with 189 additions and 400 deletions.
119 changes: 40 additions & 79 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,6 +2,7 @@
#include "barretenberg/commitment_schemes/claim.hpp"
#include "barretenberg/commitment_schemes/verification_key.hpp"
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
#include "barretenberg/transcript/transcript.hpp"
#include <cstddef>
@@ -90,8 +91,6 @@ template <typename Curve_> class IPA {
#ifdef IPA_FUZZ_TEST
friend class ProxyCaller;
#endif
// clang-format off

/**
* @brief Compute an inner product argument proof for opening a single polynomial at a single evaluation point.
*
@@ -128,16 +127,14 @@ template <typename Curve_> class IPA {
*
*7. Send the final \f$\vec{a}_{0} = (a_0)\f$ to the verifier
*/
template <typename Transcript>
static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
const ProverOpeningClaim<Curve>& opening_claim,
const std::shared_ptr<Transcript>& transcript)
{

Polynomial polynomial = opening_claim.polynomial;
template <typename Transcript>
static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
const ProverOpeningClaim<Curve>& opening_claim,
const std::shared_ptr<Transcript>& transcript)
{
const Polynomial& polynomial = opening_claim.polynomial;

// clang-format on
auto poly_length = static_cast<size_t>(polynomial.size());
size_t poly_length = polynomial.size();

// Step 1.
// Send polynomial degree + 1 = d to the verifier
@@ -169,36 +166,27 @@ template <typename Curve_> class IPA {
// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&G_vec_local, srs_elements](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/0,
/*finite_field_inversions_per_iteration=*/0,
/*group_element_additions_per_iteration=*/0,
/*group_element_doublings_per_iteration=*/0,
/*scalar_multiplications_per_iteration=*/0,
/*sequential_copy_ops_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST);

// Step 5.
// Compute vector b (vector of the powers of the challenge)
OpeningPair<Curve> opening_pair = opening_claim.opening_pair;
std::vector<Fr> b_vec(poly_length);
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&b_vec, &opening_pair](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr b_power = opening_pair.challenge.pow(start);
for (size_t i = start; i < end; i++) {
b_vec[i] = b_power;
b_power *= opening_pair.challenge;
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST + thread_heuristics::FF_MULTIPLICATION_COST);

// Iterate for log(poly_degree) rounds to compute the round commitments.
auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_length));
@@ -221,18 +209,9 @@ template <typename Curve_> class IPA {
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
// Run scalar products in parallel
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
round_size,
[&a_vec,
&b_vec,
round_size,
&inner_prod_L,
&inner_prod_R
#ifndef NO_MULTITHREADING
,
&inner_product_accumulation_mutex
#endif
](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
@@ -247,9 +226,7 @@ template <typename Curve_> class IPA {
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
}
},
/*finite_field_additions_per_iteration=*/2,
/*finite_field_multiplications_per_iteration=*/2);
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

// Step 6.a (using letters, because doxygen automaticall converts the sublist counters to letters :( )
// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
@@ -281,29 +258,24 @@ template <typename Curve_> class IPA {
// Step 6.e
// G_vec_new = G_vec_lo + G_vec_hi * round_challenge_inv
auto G_hi_by_inverse_challenge = GroupElement::batch_mul_with_endomorphism(
std::span{ G_vec_local.begin() + static_cast<long>(round_size),
G_vec_local.begin() + static_cast<long>(round_size * 2) },
std::span{ G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size),
G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size * 2) },
round_challenge_inv);
GroupElement::batch_affine_add(
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<std::ptrdiff_t>(round_size) },
G_hi_by_inverse_challenge,
G_vec_local);

// Steps 6.e and 6.f
// Update the vectors a_vec, b_vec.
// a_vec_new = a_vec_lo + a_vec_hi * round_challenge
// b_vec_new = b_vec_lo + b_vec_hi * round_challenge_inv
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
round_size,
[&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) {
for (size_t j = start; j < end; j++) {
a_vec[j] += round_challenge * a_vec[round_size + j];
b_vec[j] += round_challenge_inv * b_vec[round_size + j];
}
},
/*finite_field_additions_per_iteration=*/4,
/*finite_field_multiplications_per_iteration=*/8,
/*finite_field_inversions_per_iteration=*/1);
[&](size_t j) {
a_vec[j] += round_challenge * a_vec[round_size + j];
b_vec[j] += round_challenge_inv * b_vec[round_size + j];
}, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);
}

// Step 7
@@ -409,23 +381,19 @@ template <typename Curve_> class IPA {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its
// O(nlogn). This can be optimized to be linear by computing a tree of products. Its very readable, so we're
// leaving it unoptimized for now.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
[&](size_t i) {
Fr s_vec_scalar = Fr::one();
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
}
s_vec[i] = s_vec_scalar;
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/log_poly_degree);
s_vec[i] = s_vec_scalar;
}, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);

auto* srs_elements = vk->get_monomial_points();

@@ -435,20 +403,13 @@ template <typename Curve_> class IPA {
// The SRS stored in the commitment key is the result after applying the pippenger point table so the
// values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
// G_vec_local should use only the original SRS thus we extract only the even indices.
run_loop_in_parallel_if_effective(
parallel_for_heuristic(
poly_length,
[&G_vec_local, srs_elements](size_t start, size_t end) {
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t i = start * 2; i < end * 2; i += 2) {
G_vec_local[i >> 1] = srs_elements[i];
}
},
/*finite_field_additions_per_iteration=*/0,
/*finite_field_multiplications_per_iteration=*/0,
/*finite_field_inversions_per_iteration=*/0,
/*group_element_additions_per_iteration=*/0,
/*group_element_doublings_per_iteration=*/0,
/*scalar_multiplications_per_iteration=*/0,
/*sequential_copy_ops_per_iteration=*/1);
}, thread_heuristics::FF_COPY_COST * 2);

// Step 8.
// Compute G₀
@@ -497,7 +458,7 @@ template <typename Curve_> class IPA {
// Ensure polynomial length cannot be changed from its default specified valued
poly_length_var.fix_witness();

const uint32_t poly_length = static_cast<uint32_t>(poly_length_var.get_value());
const auto poly_length = static_cast<uint32_t>(poly_length_var.get_value());

// Step 2.
// Receive generator challenge u and compute auxiliary generator
@@ -559,7 +520,7 @@ template <typename Curve_> class IPA {
// O(nlogn). This can be optimized to be linear by computing a tree of products.
for (size_t i = 0; i < poly_length; i++) {
Fr s_vec_scalar = Fr(1);
for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
auto bit = (i >> j) & 1;
bool b = static_cast<bool>(bit);
if (b) {
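The mutex-guarded scalar-product accumulation in the IPA hunk above is one of the spots
where the planned followup could use the chunk index instead of a mutex. A minimal sketch
of that pattern, not part of this commit (`get_num_cpus`, `bb::fr`, and sizing the per-chunk
accumulators by the CPU count are assumptions):

```cpp
#include "barretenberg/common/thread.hpp"
#include <cstddef>
#include <vector>

// Sum a vector of field elements without a mutex: each chunk writes to its own
// accumulator slot, then the partial sums are reduced sequentially.
bb::fr parallel_sum(const std::vector<bb::fr>& values)
{
    std::vector<bb::fr> partial_sums(get_num_cpus(), bb::fr(0));
    parallel_for_heuristic(
        values.size(),
        [&](size_t start, size_t end, size_t chunk_index) {
            for (size_t i = start; i < end; i++) {
                partial_sums[chunk_index] += values[i];
            }
        },
        thread_heuristics::FF_ADDITION_COST);
    bb::fr total(0);
    for (const auto& partial : partial_sums) {
        total += partial;
    }
    return total;
}
```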
4 changes: 3 additions & 1 deletion barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -23,4 +23,6 @@
#else
#define BB_LIKELY(x) x
#define BB_UNLIKELY(x) x
#endif
#endif

#define BB_UNUSED [[maybe_unused]]
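
For context, a tiny illustrative example of the new macro (names are hypothetical): it marks
a parameter that must be accepted to satisfy a callback shape but is intentionally unused.

```cpp
#include "barretenberg/common/compiler_hints.hpp"
#include <cstddef>

// chunk_index is accepted to match the (start, end, chunk_index) lambda shape but not used here.
void copy_range(const int* src, int* dst, size_t start, size_t end, BB_UNUSED size_t chunk_index)
{
    for (size_t i = start; i < end; i++) {
        dst[i] = src[i];
    }
}
```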
6 changes: 6 additions & 0 deletions barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
@@ -5,6 +5,7 @@
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#ifndef NO_MULTITHREADING
#include <mutex>
#endif
@@ -75,4 +76,9 @@ template <typename T> class ContainerSlabAllocator {
}
};

/**
* @brief A vector that uses the slab allocator.
*/
template <typename T> using SlabVector = std::vector<T, bb::ContainerSlabAllocator<T>>;

} // namespace bb
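
A minimal usage sketch for the new alias (the element type and function name are illustrative):
`SlabVector` behaves like `std::vector` but its allocations go through the slab allocator.

```cpp
#include "barretenberg/common/slab_allocator.hpp"
#include <cstddef>
#include <cstdint>

// Illustrative only: any element type works; allocations are served by ContainerSlabAllocator.
bb::SlabVector<uint64_t> make_indices(size_t n)
{
    bb::SlabVector<uint64_t> indices; // same interface as std::vector<uint64_t>
    indices.reserve(n);
    for (size_t i = 0; i < n; i++) {
        indices.push_back(i);
    }
    return indices;
}
```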
74 changes: 9 additions & 65 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
* @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
*
*/
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
void parallel_for_range(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal)
{
if (num_points <= no_multhreading_if_less_or_equal) {
func(0, num_points);
@@ -129,45 +129,10 @@ void run_loop_in_parallel(size_t num_points,
});
};

/**
* @brief Split a loop into several loops running in parallel based on operations in 1 iteration
*
* @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
* that should contain the work loop, but only if it's worth it
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* Or for the version with index:
* [](size_t start, size_t end,size_t workload_index){for (size_t i=start; i<end; i++){(void)i;}}
* @param finite_field_additions_per_iteration The number of additions/subtractions/negations
* @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
* @param finite_field_inversions_per_iteration
* @param group_element_additions_per_iteration Projective addition number
* @param group_element_doublings_per_iteration Projective doubling number
* @param scalar_multiplications_per_iteration
* @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
*/
template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(size_t num_points,
const FunctionType& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
void parallel_for_heuristic(size_t num_points,
const std::function<void(size_t, size_t, size_t)>& func,
size_t heuristic_cost)
{
// Rough cost of operations (the operation costs are derives in basics_bench and the units are nanoseconds):
constexpr size_t FF_ADDITION_COST = 4;
constexpr size_t FF_MULTIPLICATION_COST = 21;
constexpr size_t FF_INVERSION_COST = 7000;
constexpr size_t GE_ADDITION_COST = 350;
constexpr size_t GE_DOUBLING_COST = 194;
constexpr size_t SM_COST = 50000;
constexpr size_t SEQ_COPY_COST = 3;
// We take the maximum observed parallel_for cost (388 us) and round it up.
// The goals of these checks is to evade significantly (10x) increasing processing time for small workloads. So we
// can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium
@@ -180,23 +145,11 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);

// Compute the cost of all operations done by other threads
const size_t offset_cost =
(num_points - chunk_size) *
(finite_field_additions_per_iteration * FF_ADDITION_COST +
finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
finite_field_inversions_per_iteration * FF_INVERSION_COST +
group_element_additions_per_iteration * GE_ADDITION_COST +
group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST +
sequential_copy_ops_per_iteration * SEQ_COPY_COST);
const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;

// If starting parallel for is longer than computing, just compute
if (offset_cost < PARALLEL_FOR_COST) {
if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(0, num_points);
} else {
func(0, num_points, 0);
}
func(0, num_points, 0);
return;
}
// Parallelize over chunks
@@ -213,18 +166,9 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;

if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(start, end);
} else {
func(start, end, chunk_index);
}
func(start, end, chunk_index);
});
};
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);

/**
* @brief calculates number of threads to create based on minimum iterations per thread
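For a back-of-the-envelope sense of when the heuristic actually spawns threads (assuming the
parallel_for launch-cost constant is on the order of the ~400 us mentioned in the comment above):
with heuristic_cost = FF_MULTIPLICATION_COST = 21 ns, offset_cost = (num_points - chunk_size) * 21 ns
only exceeds the launch cost once roughly 19,000 points fall outside the first chunk, so small
multiplication loops stay single-threaded.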
1 comment on commit 5bff26b

@AztecBot

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'C++ Benchmark'.
The benchmark result for this commit is worse than the previous result by more than the 1.05 threshold.

| Benchmark suite | Current: 5bff26b | Previous: 4cb5c83 | Ratio |
| --- | --- | --- | --- |
| nativeconstruct_proof_ultrahonk_power_of_2/20 | 5375.36291100001 ms/iter | 4944.324507000005 ms/iter | 1.09 |

This comment was automatically generated by workflow using github-action-benchmark.

CC: @ludamad @codygunton
