From 94922fc24e728100b456ed5f0203974964fd9f83 Mon Sep 17 00:00:00 2001 From: Cody Gunton Date: Tue, 12 Mar 2024 17:50:39 -0400 Subject: [PATCH] feat: Multithreaded prover folding (#5147) We only get an 80% reduction in the time spent in `ProtoGalaxyProver_::accumulator_update_round` on 16 threads (3462 ms before vs. 722 ms after), but it's still a nice improvement in overall time (29446 ms before vs. 26935 ms after). ## Before ``` Benchmarking lock created at ~/BENCHMARK_IN_PROGRESS. client_ivc_bench 100% 15MB 48.4MB/s 00:00 2024-03-12T03:38:23+00:00 Running ./client_ivc_bench Run on (16 X 3000 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 1024 KiB (x8) L3 Unified 36608 KiB (x1) Load Average: 0.00, 0.00, 0.11 -------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------- ClientIVCBench/Full/6 29446 ms 24872 ms 1 Decider::construct_proof=1 Decider::construct_proof(t)=753.082M ECCVMComposer::compute_commitment_key=1 ECCVMComposer::compute_commitment_key(t)=3.78026M ECCVMComposer::compute_witness=1 ECCVMComposer::compute_witness(t)=1.73739G ECCVMComposer::create_prover=1 ECCVMComposer::create_prover(t)=3.40893G ECCVMComposer::create_proving_key=1 ECCVMComposer::create_proving_key(t)=1.66752G ECCVMProver::construct_proof=1 ECCVMProver::construct_proof(t)=1.76066G Goblin::merge=11 Goblin::merge(t)=128.627M GoblinTranslatorCircuitBuilder::constructor=1 GoblinTranslatorCircuitBuilder::constructor(t)=56.5845M GoblinTranslatorComposer::create_prover=1 GoblinTranslatorComposer::create_prover(t)=123.279M GoblinTranslatorProver::construct_proof=1 GoblinTranslatorProver::construct_proof(t)=927.524M ProtoGalaxyProver_::accumulator_update_round=10 ProtoGalaxyProver_::accumulator_update_round(t)=3.46156G ProtoGalaxyProver_::combiner_quotient_round=10 ProtoGalaxyProver_::combiner_quotient_round(t)=7.17713G ProtoGalaxyProver_::perturbator_round=10 ProtoGalaxyProver_::perturbator_round(t)=1.38221G ProtoGalaxyProver_::preparation_round=10 
ProtoGalaxyProver_::preparation_round(t)=4.1G ProtogalaxyProver::fold_instances=10 ProtogalaxyProver::fold_instances(t)=16.1209G ProverInstance(Circuit&)=11 ProverInstance(Circuit&)(t)=1.945G batch_mul_with_endomorphism=30 batch_mul_with_endomorphism(t)=562.528M commit=425 commit(t)=3.96966G compute_combiner=10 compute_combiner(t)=7.175G compute_perturbator=9 compute_perturbator(t)=1.38188G compute_univariate=48 compute_univariate(t)=1.41821G construct_circuits=6 construct_circuits(t)=4.20217G Benchmarking lock deleted. client_ivc_bench.json 100% 4015 130.4KB/s 00:00 function ms % sum construct_circuits(t) 4202 14.31% ProverInstance(Circuit&)(t) 1945 6.62% ProtogalaxyProver::fold_instances(t) 16121 54.89% Decider::construct_proof(t) 753 2.56% ECCVMComposer::create_prover(t) 3409 11.61% GoblinTranslatorComposer::create_prover(t) 123 0.42% ECCVMProver::construct_proof(t) 1761 5.99% GoblinTranslatorProver::construct_proof(t) 928 3.16% Goblin::merge(t) 129 0.44% Total time accounted for: 29370ms/29446ms = 99.74% Major contributors: function ms % sum commit(t) 3970 13.52% compute_combiner(t) 7175 24.43% compute_perturbator(t) 1382 4.71% compute_univariate(t) 1418 4.83% Breakdown of ECCVMProver::create_prover: ECCVMComposer::compute_witness(t) 1737 50.97% ECCVMComposer::create_proving_key(t) 1668 48.92% Breakdown of ProtogalaxyProver::fold_instances: ProtoGalaxyProver_::preparation_round(t) 4100 25.43% ProtoGalaxyProver_::perturbator_round(t) 1382 8.57% ProtoGalaxyProver_::combiner_quotient_round(t) 7177 44.52% ProtoGalaxyProver_::accumulator_update_round(t) 3462 21.47% ``` ## After ``` Benchmarking lock created at ~/BENCHMARK_IN_PROGRESS. 
client_ivc_bench 100% 15MB 53.8MB/s 00:00 2024-03-12T13:10:44+00:00 Running ./client_ivc_bench Run on (16 X 3606.49 MHz CPU s) CPU Caches: L1 Data 32 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 1024 KiB (x8) L3 Unified 36608 KiB (x1) Load Average: 0.00, 0.92, 1.95 -------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... -------------------------------------------------------------------------------- ClientIVCBench/Full/6 26935 ms 22185 ms 1 Decider::construct_proof=1 Decider::construct_proof(t)=760.055M ECCVMComposer::compute_commitment_key=1 ECCVMComposer::compute_commitment_key(t)=3.6897M ECCVMComposer::compute_witness=1 ECCVMComposer::compute_witness(t)=1.74049G ECCVMComposer::create_prover=1 ECCVMComposer::create_prover(t)=3.41363G ECCVMComposer::create_proving_key=1 ECCVMComposer::create_proving_key(t)=1.66922G ECCVMProver::construct_proof=1 ECCVMProver::construct_proof(t)=1.76915G Goblin::merge=11 Goblin::merge(t)=128.378M GoblinTranslatorCircuitBuilder::constructor=1 GoblinTranslatorCircuitBuilder::constructor(t)=57.5506M GoblinTranslatorComposer::create_prover=1 GoblinTranslatorComposer::create_prover(t)=121.602M GoblinTranslatorProver::construct_proof=1 GoblinTranslatorProver::construct_proof(t)=927.714M ProtoGalaxyProver_::accumulator_update_round=10 ProtoGalaxyProver_::accumulator_update_round(t)=722M ProtoGalaxyProver_::combiner_quotient_round=10 ProtoGalaxyProver_::combiner_quotient_round(t)=7.24835G ProtoGalaxyProver_::perturbator_round=10 ProtoGalaxyProver_::perturbator_round(t)=1.39394G ProtoGalaxyProver_::preparation_round=10 ProtoGalaxyProver_::preparation_round(t)=4.11739G ProtogalaxyProver::fold_instances=10 ProtogalaxyProver::fold_instances(t)=13.4817G ProverInstance(Circuit&)=11 ProverInstance(Circuit&)(t)=2.0017G batch_mul_with_endomorphism=30 batch_mul_with_endomorphism(t)=564.354M commit=425 commit(t)=3.98668G compute_combiner=10 compute_combiner(t)=7.24616G 
compute_perturbator=9 compute_perturbator(t)=1.3936G compute_univariate=48 compute_univariate(t)=1.42762G construct_circuits=6 construct_circuits(t)=4.25115G Benchmarking lock deleted. client_ivc_bench.json 100% 4021 139.1KB/s 00:00 function ms % sum construct_circuits(t) 4251 15.83% ProverInstance(Circuit&)(t) 2002 7.45% ProtogalaxyProver::fold_instances(t) 13482 50.20% Decider::construct_proof(t) 760 2.83% ECCVMComposer::create_prover(t) 3414 12.71% GoblinTranslatorComposer::create_prover(t) 122 0.45% ECCVMProver::construct_proof(t) 1769 6.59% GoblinTranslatorProver::construct_proof(t) 928 3.45% Goblin::merge(t) 128 0.48% Total time accounted for: 26855ms/26935ms = 99.70% Major contributors: function ms % sum commit(t) 3987 14.85% compute_combiner(t) 7246 26.98% compute_perturbator(t) 1394 5.19% compute_univariate(t) 1428 5.32% Breakdown of ECCVMProver::create_prover: ECCVMComposer::compute_witness(t) 1740 50.99% ECCVMComposer::create_proving_key(t) 1669 48.90% Breakdown of ProtogalaxyProver::fold_instances: ProtoGalaxyProver_::preparation_round(t) 4117 30.54% ProtoGalaxyProver_::perturbator_round(t) 1394 10.34% ProtoGalaxyProver_::combiner_quotient_round(t) 7248 53.76% ProtoGalaxyProver_::accumulator_update_round(t) 722 5.36% ``` --------- Co-authored-by: maramihali --- .../protogalaxy/protogalaxy_prover.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.cpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.cpp index 172c02009ed..cb2df2fb6e2 100644 --- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover.cpp @@ -85,12 +85,17 @@ std::shared_ptr ProtoGalaxyProver_prover_polynomials.get_all())) { - for (auto [acc_el, inst_el] : zip_view(acc_poly, inst_poly)) { - acc_el += inst_el * lagranges[inst_idx]; + auto accumulator_polys = acc_prover_polynomials.get_all(); 
+ auto input_polys = instances[inst_idx]->prover_polynomials.get_all(); + run_loop_in_parallel(Flavor::NUM_ALL_ENTITIES, [&](size_t start_idx, size_t end_idx) { + for (size_t poly_idx = start_idx; poly_idx < end_idx; poly_idx++) { + auto& acc_poly = accumulator_polys[poly_idx]; + auto& inst_poly = input_polys[poly_idx]; + for (auto [acc_el, inst_el] : zip_view(acc_poly, inst_poly)) { + acc_el += inst_el * lagranges[inst_idx]; + } } - } + }); } next_accumulator->prover_polynomials = std::move(acc_prover_polynomials);