
chore: Remove mutex dependency #4160

Merged: 1 commit, Jan 19, 2024
24 changes: 16 additions & 8 deletions barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -43,7 +43,6 @@ template <typename Curve> class IPA {
transcript->send_to_verifier("IPA:poly_degree", static_cast<uint64_t>(poly_degree));
const Fr generator_challenge = transcript->get_challenge("IPA:generator_challenge");
auto aux_generator = Commitment::one() * generator_challenge;

// Checks poly_degree is greater than zero and a power of two
// In the future, we might want to consider if non-powers of two are needed
ASSERT((poly_degree > 0) && (!(poly_degree & (poly_degree - 1))) &&
@@ -90,30 +89,39 @@ template <typename Curve> class IPA {
std::vector<GroupElement> R_elements(log_poly_degree);
std::size_t round_size = poly_degree;

// Allocate vectors for parallel storage of partial products
const size_t num_cpus = get_num_cpus();
std::vector<Fr> partial_inner_prod_L(num_cpus);
std::vector<Fr> partial_inner_prod_R(num_cpus);
// Perform IPA rounds
for (size_t i = 0; i < log_poly_degree; i++) {
round_size >>= 1;
// Set partial products to zero
memset(&partial_inner_prod_L[0], 0, sizeof(Fr) * num_cpus);
memset(&partial_inner_prod_R[0], 0, sizeof(Fr) * num_cpus);
// Compute inner_prod_L := < a_vec_lo, b_vec_hi > and inner_prod_R := < a_vec_hi, b_vec_lo >
std::mutex addition_lock;
Fr inner_prod_L = Fr::zero();
Fr inner_prod_R = Fr::zero();
// Run scalar product in parallel
run_loop_in_parallel_if_effective(
run_loop_in_parallel_if_effective_with_index(
round_size,
[&a_vec, &b_vec, &inner_prod_L, &inner_prod_R, round_size, &addition_lock](size_t start, size_t end) {
[&a_vec, &b_vec, round_size, &partial_inner_prod_L, &partial_inner_prod_R](
size_t start, size_t end, size_t workload_index) {
Fr current_inner_prod_L = Fr::zero();
Fr current_inner_prod_R = Fr::zero();
for (size_t j = start; j < end; j++) {
current_inner_prod_L += a_vec[j] * b_vec[round_size + j];
current_inner_prod_R += a_vec[round_size + j] * b_vec[j];
}
addition_lock.lock();
inner_prod_L += current_inner_prod_L;
inner_prod_R += current_inner_prod_R;
addition_lock.unlock();
partial_inner_prod_L[workload_index] = current_inner_prod_L;
partial_inner_prod_R[workload_index] = current_inner_prod_R;
},
/*finite_field_additions_per_iteration=*/2,
/*finite_field_multiplications_per_iteration=*/2);
for (size_t j = 0; j < num_cpus; j++) {
inner_prod_L += partial_inner_prod_L[j];
inner_prod_R += partial_inner_prod_R[j];
}

// L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
L_elements[i] = bb::scalar_multiplication::pippenger_without_endomorphism_basis_points<Curve>(
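
The change above drops the std::mutex in favour of one accumulator slot per workload, written without locking and reduced sequentially after the parallel loop. A standalone sketch of the same pattern, using illustrative names and plain std::thread rather than the repository's parallel helpers (it assumes a and b have the same length):

// Sketch of the mutex-free accumulation pattern: each worker writes its partial
// result into its own slot, and the slots are summed sequentially at the end.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <thread>
#include <vector>

long long parallel_inner_product(const std::vector<long long>& a, const std::vector<long long>& b)
{
    const std::size_t num_workers = std::max<std::size_t>(1, std::thread::hardware_concurrency());
    std::vector<long long> partial_sums(num_workers, 0); // one slot per worker, so no lock is needed
    std::vector<std::thread> workers;
    const std::size_t chunk = (a.size() + num_workers - 1) / num_workers;
    for (std::size_t w = 0; w < num_workers; w++) {
        workers.emplace_back([&, w]() {
            const std::size_t start = w * chunk;
            const std::size_t end = std::min(a.size(), start + chunk);
            long long local = 0;
            for (std::size_t i = start; i < end; i++) {
                local += a[i] * b[i];
            }
            partial_sums[w] = local; // each worker touches only its own slot
        });
    }
    for (auto& t : workers) {
        t.join();
    }
    // Sequential reduction replaces the mutex-guarded accumulation.
    return std::accumulate(partial_sums.begin(), partial_sums.end(), 0LL);
}
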
44 changes: 32 additions & 12 deletions barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -136,6 +136,8 @@ void run_loop_in_parallel(size_t num_points,
* @param num_points Total number of elements
* @param func A function or lambda expression with a for loop inside, for example:
* [](size_t start, size_t end){for (size_t i=start; i<end; i++){(void)i;}}
* Or for the version with index:
* [](size_t start, size_t end, size_t workload_index){for (size_t i=start; i<end; i++){(void)i;}}
* @param finite_field_additions_per_iteration The number of additions/subtractions/negations
* @param finite_field_multiplications_per_iteration The number of finite field multiplications and squarings
* @param finite_field_inversions_per_iteration
@@ -144,15 +146,18 @@ * @param group_element_doublings_per_iteration
* @param scalar_multiplications_per_iteration
* @param sequential_copy_ops_per_iteration Field element (16 byte) sequential copy number
*/
void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(size_t num_points,
const FunctionType& func,
size_t finite_field_additions_per_iteration,
size_t finite_field_multiplications_per_iteration,
size_t finite_field_inversions_per_iteration,
size_t group_element_additions_per_iteration,
size_t group_element_doublings_per_iteration,
size_t scalar_multiplications_per_iteration,
size_t sequential_copy_ops_per_iteration)
{
// Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds):
constexpr size_t FF_ADDITION_COST = 4;
@@ -185,7 +190,12 @@ void run_loop_in_parallel_if_effective(size_t num_points,

// If starting parallel for is longer than computing, just compute
if (offset_cost < PARALLEL_FOR_COST) {
func(0, num_points);
if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(0, num_points);
} else {
func(0, num_points, 0);
}
return;
}
// Parallelize over chunks
@@ -201,6 +211,16 @@ void run_loop_in_parallel_if_effective(size_t num_points,
}
size_t start = chunk_index * chunk_size;
size_t end = chunk_index * chunk_size + current_chunk_size;
func(start, end);

if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {

func(start, end);
} else {
func(start, end, chunk_index);
}
});
};
};
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
template void run_loop_in_parallel_if_effective_internal(
size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
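
A reduced sketch of the dispatch technique introduced above: a single template constrained to the two std::function signatures, with if constexpr selecting the call form at compile time. The names here (run_chunked, num_chunks) are illustrative, not the repository's API:

// One constrained template serves both callback shapes; `if constexpr` picks
// the call form at compile time, so there is no runtime branching on type.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <type_traits>

template <typename FunctionType>
    requires(std::is_same_v<FunctionType, std::function<void(std::size_t, std::size_t)>> ||
             std::is_same_v<FunctionType, std::function<void(std::size_t, std::size_t, std::size_t)>>)
void run_chunked(std::size_t num_points, std::size_t num_chunks, const FunctionType& func)
{
    num_chunks = std::max<std::size_t>(num_chunks, 1);
    const std::size_t chunk_size = (num_points + num_chunks - 1) / num_chunks;
    for (std::size_t chunk_index = 0; chunk_index < num_chunks; chunk_index++) {
        const std::size_t start = chunk_index * chunk_size;
        const std::size_t end = std::min(num_points, start + chunk_size);
        if (start >= end) {
            break; // no work left for the remaining chunks
        }
        if constexpr (std::is_same_v<FunctionType, std::function<void(std::size_t, std::size_t)>>) {
            func(start, end); // plain range callback
        } else {
            func(start, end, chunk_index); // callback that also receives its workload index
        }
    }
}

As in the PR, the callback has to arrive as a std::function for the constraint to match, which is why thread.hpp adds thin inline wrappers rather than exposing the template directly.
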
73 changes: 64 additions & 9 deletions barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -26,12 +26,67 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func
void run_loop_in_parallel(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t no_multhreading_if_less_or_equal = 0);
void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0);

template <typename FunctionType>
requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
void run_loop_in_parallel_if_effective_internal(
size_t, const FunctionType&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
/**
* @brief Runs the loop in parallel if parallelization is useful (i.e. it costs less than running the algorithm sequentially)
*
* @details Please see run_loop_in_parallel_if_effective_internal for a detailed description
*
*/
inline void run_loop_in_parallel_if_effective(size_t num_points,
const std::function<void(size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0

)
{
run_loop_in_parallel_if_effective_internal(num_points,
func,
finite_field_additions_per_iteration,
finite_field_multiplications_per_iteration,
finite_field_inversions_per_iteration,
group_element_additions_per_iteration,
group_element_doublings_per_iteration,
scalar_multiplications_per_iteration,
sequential_copy_ops_per_iteration);
}

/**
* @brief Runs the loop in parallel if parallelization is useful (i.e. it costs less than running the algorithm sequentially). The loop
* function is given the index of the workload.
*
* @details Please see run_loop_in_parallel_if_effective_internal for a detailed description
*
*/
inline void run_loop_in_parallel_if_effective_with_index(size_t num_points,
const std::function<void(size_t, size_t, size_t)>& func,
size_t finite_field_additions_per_iteration = 0,
size_t finite_field_multiplications_per_iteration = 0,
size_t finite_field_inversions_per_iteration = 0,
size_t group_element_additions_per_iteration = 0,
size_t group_element_doublings_per_iteration = 0,
size_t scalar_multiplications_per_iteration = 0,
size_t sequential_copy_ops_per_iteration = 0

)
{
run_loop_in_parallel_if_effective_internal(num_points,
func,
finite_field_additions_per_iteration,
finite_field_multiplications_per_iteration,
finite_field_inversions_per_iteration,
group_element_additions_per_iteration,
group_element_doublings_per_iteration,
scalar_multiplications_per_iteration,
sequential_copy_ops_per_iteration);
}
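
A hypothetical call site for the new wrapper, mirroring the ipa.hpp change: one accumulator slot per workload, written without a mutex and reduced sequentially once the parallel loop returns. The example assumes get_num_cpus() and the include path are available as they are in ipa.hpp, and the cost-hint value is only a rough placeholder:

// Per-workload accumulation via run_loop_in_parallel_if_effective_with_index.
#include <cstdint>
#include <vector>

#include "barretenberg/common/thread.hpp"

uint64_t sum_values(const std::vector<uint64_t>& values)
{
    const size_t num_cpus = get_num_cpus();
    std::vector<uint64_t> partial_sums(num_cpus, 0);

    run_loop_in_parallel_if_effective_with_index(
        values.size(),
        [&values, &partial_sums](size_t start, size_t end, size_t workload_index) {
            uint64_t local = 0;
            for (size_t i = start; i < end; i++) {
                local += values[i];
            }
            partial_sums[workload_index] = local; // each workload owns exactly one slot
        },
        /*finite_field_additions_per_iteration=*/1);

    uint64_t total = 0;
    for (const uint64_t partial : partial_sums) {
        total += partial; // sequential reduction, no locking required
    }
    return total;
}
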