diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
index bdb31f2eac5..bac4516fa83 100644
--- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
+++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -2,6 +2,7 @@
 #include "barretenberg/commitment_schemes/claim.hpp"
 #include "barretenberg/commitment_schemes/verification_key.hpp"
 #include "barretenberg/common/assert.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"
 #include "barretenberg/transcript/transcript.hpp"
 #include
@@ -90,8 +91,6 @@ template <typename Curve_> class IPA {
 #ifdef IPA_FUZZ_TEST
     friend class ProxyCaller;
 #endif
-    // clang-format off
-
     /**
      * @brief Compute an inner product argument proof for opening a single polynomial at a single evaluation point.
      *
@@ -128,16 +127,14 @@ template <typename Curve_> class IPA {
      *
      *7. Send the final \f$\vec{a}_{0} = (a_0)\f$ to the verifier
      */
-    template <typename Transcript>
-    static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
-                                               const ProverOpeningClaim<Curve>& opening_claim,
-                                               const std::shared_ptr<Transcript>& transcript)
-    {
-
-        Polynomial polynomial = opening_claim.polynomial;
+    template <typename Transcript>
+    static void compute_opening_proof_internal(const std::shared_ptr<CK>& ck,
+                                               const ProverOpeningClaim<Curve>& opening_claim,
+                                               const std::shared_ptr<Transcript>& transcript)
+    {
+        const Polynomial& polynomial = opening_claim.polynomial;

-        // clang-format on
-        auto poly_length = static_cast<size_t>(polynomial.size());
+        size_t poly_length = polynomial.size();

         // Step 1.
         // Send polynomial degree + 1 = d to the verifier
@@ -169,36 +166,27 @@ template <typename Curve_> class IPA {
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&G_vec_local, srs_elements](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST);

         // Step 5.
         // Compute vector b (vector of the powers of the challenge)
         OpeningPair<Curve> opening_pair = opening_claim.opening_pair;
         std::vector<Fr> b_vec(poly_length);
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&b_vec, &opening_pair](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 Fr b_power = opening_pair.challenge.pow(start);
                 for (size_t i = start; i < end; i++) {
                     b_vec[i] = b_power;
                     b_power *= opening_pair.challenge;
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST + thread_heuristics::FF_MULTIPLICATION_COST);

         // Iterate for log(poly_degree) rounds to compute the round commitments.
         auto log_poly_degree = static_cast<size_t>(numeric::get_msb(poly_length));
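A note on the pattern in the Step 5 hunk above: because each worker receives a half-open [start, end) chunk, it must seed its running accumulator with challenge^start so the chunks stay independent. A minimal standalone sketch of the same idea, with uint64_t standing in for Fr and an explicit pow helper standing in for Fr::pow (a hypothetical simplification, not the library's field type):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Square-and-multiply exponentiation; arithmetic wraps mod 2^64, which is
    // fine for illustrating the chunking pattern.
    static uint64_t pow_u64(uint64_t base, size_t exp)
    {
        uint64_t result = 1;
        while (exp != 0) {
            if ((exp & 1) != 0) {
                result *= base;
            }
            base *= base;
            exp >>= 1;
        }
        return result;
    }

    // Fill b_vec[start..end) with successive powers of the challenge. Seeding
    // the accumulator with challenge^start makes each chunk self-contained, so
    // chunks can run on different threads with no shared state.
    void fill_challenge_powers(std::vector<uint64_t>& b_vec, uint64_t challenge, size_t start, size_t end)
    {
        uint64_t b_power = pow_u64(challenge, start);
        for (size_t i = start; i < end; i++) {
            b_vec[i] = b_power;
            b_power *= challenge;
        }
    }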
@@ -221,18 +209,9 @@ template <typename Curve_> class IPA {
             Fr inner_prod_L = Fr::zero();
             Fr inner_prod_R = Fr::zero();
             // Run scalar products in parallel
-            run_loop_in_parallel_if_effective(
+            parallel_for_heuristic(
                 round_size,
-                [&a_vec,
-                 &b_vec,
-                 round_size,
-                 &inner_prod_L,
-                 &inner_prod_R
-#ifndef NO_MULTITHREADING
-                 ,
-                 &inner_product_accumulation_mutex
-#endif
-            ](size_t start, size_t end) {
+                [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                     Fr current_inner_prod_L = Fr::zero();
                     Fr current_inner_prod_R = Fr::zero();
                     for (size_t j = start; j < end; j++) {
@@ -247,9 +226,7 @@ template <typename Curve_> class IPA {
                         inner_prod_L += current_inner_prod_L;
                         inner_prod_R += current_inner_prod_R;
                     }
-                },
-                /*finite_field_additions_per_iteration=*/2,
-                /*finite_field_multiplications_per_iteration=*/2);
+                }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);

             // Step 6.a (using letters, because doxygen automatically converts the sublist counters to letters :( )
             // L_i = < a_vec_lo, G_vec_hi > + inner_prod_L * aux_generator
@@ -281,11 +258,11 @@ template <typename Curve_> class IPA {
             // Step 6.e
             // G_vec_new = G_vec_lo + G_vec_hi * round_challenge_inv
             auto G_hi_by_inverse_challenge = GroupElement::batch_mul_with_endomorphism(
-                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
-                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
+                std::span{ G_vec_local.begin() + static_cast<long>(round_size),
+                           G_vec_local.begin() + static_cast<long>(round_size * 2) },
                 round_challenge_inv);
             GroupElement::batch_affine_add(
-                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
+                std::span{ G_vec_local.begin(), G_vec_local.begin() + static_cast<long>(round_size) },
                 G_hi_by_inverse_challenge,
                 G_vec_local);

@@ -293,17 +270,12 @@ template <typename Curve_> class IPA {
             // Update the vectors a_vec, b_vec.
             // a_vec_new = a_vec_lo + a_vec_hi * round_challenge
             // b_vec_new = b_vec_lo + b_vec_hi * round_challenge_inv
-            run_loop_in_parallel_if_effective(
+            parallel_for_heuristic(
                 round_size,
-                [&a_vec, &b_vec, round_challenge, round_challenge_inv, round_size](size_t start, size_t end) {
-                    for (size_t j = start; j < end; j++) {
-                        a_vec[j] += round_challenge * a_vec[round_size + j];
-                        b_vec[j] += round_challenge_inv * b_vec[round_size + j];
-                    }
-                },
-                /*finite_field_additions_per_iteration=*/4,
-                /*finite_field_multiplications_per_iteration=*/8,
-                /*finite_field_inversions_per_iteration=*/1);
+                [&](size_t j) {
+                    a_vec[j] += round_challenge * a_vec[round_size + j];
+                    b_vec[j] += round_challenge_inv * b_vec[round_size + j];
+                }, thread_heuristics::FF_ADDITION_COST * 2 + thread_heuristics::FF_MULTIPLICATION_COST * 2);
         }

         // Step 7
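The a_vec/b_vec update above is the classic IPA folding step, and the per-index overload makes its intent plain: the low half of each vector absorbs the high half scaled by the round challenge (or its inverse), so each round halves the problem. A toy version of one such round, with long long standing in for Fr (hypothetical simplification):

    #include <cstddef>
    #include <vector>

    // One folding round: a[j] += challenge * a[round_size + j] over the low
    // half, after which only the low half is live. log2(n) rounds reduce n
    // elements to a single one.
    void fold_round(std::vector<long long>& a, long long round_challenge, size_t round_size)
    {
        for (size_t j = 0; j < round_size; j++) {
            a[j] += round_challenge * a[round_size + j];
        }
        a.resize(round_size); // the next round operates on half as many elements
    }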
@@ -409,23 +381,19 @@ template <typename Curve_> class IPA {
         // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as it's
         // O(nlogn). This can be optimized to be linear by computing a tree of products. It's very readable, so we're
         // leaving it unoptimized for now.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) {
-                for (size_t i = start; i < end; i++) {
-                    Fr s_vec_scalar = Fr::one();
-                    for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
-                        auto bit = (i >> j) & 1;
-                        bool b = static_cast<bool>(bit);
-                        if (b) {
-                            s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                        }
+            [&](size_t i) {
+                Fr s_vec_scalar = Fr::one();
+                for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
+                    auto bit = (i >> j) & 1;
+                    bool b = static_cast<bool>(bit);
+                    if (b) {
+                        s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
                     }
-                    s_vec[i] = s_vec_scalar;
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/log_poly_degree);
+                s_vec[i] = s_vec_scalar;
+            }, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);

         auto* srs_elements = vk->get_monomial_points();
@@ -435,20 +403,13 @@
         // The SRS stored in the commitment key is the result after applying the pippenger point table so the
         // values at odd indices contain the point {srs[i-1].x * beta, srs[i-1].y}, where beta is the endomorphism
         // G_vec_local should use only the original SRS thus we extract only the even indices.
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
             poly_length,
-            [&G_vec_local, srs_elements](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                 for (size_t i = start * 2; i < end * 2; i += 2) {
                     G_vec_local[i >> 1] = srs_elements[i];
                 }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            }, thread_heuristics::FF_COPY_COST * 2);

         // Step 8.
         // Compute G₀
@@ -497,7 +458,7 @@ template <typename Curve_> class IPA {
         // Ensure polynomial length cannot be changed from its default specified value
         poly_length_var.fix_witness();

-        const uint32_t poly_length = static_cast<uint32_t>(poly_length_var.get_value());
+        const auto poly_length = static_cast<uint32_t>(poly_length_var.get_value());

         // Step 2.
         // Receive generator challenge u and compute auxiliary generator
@@ -559,7 +520,7 @@
             // O(nlogn). This can be optimized to be linear by computing a tree of products.
             for (size_t i = 0; i < poly_length; i++) {
                 Fr s_vec_scalar = Fr(1);
-                for (size_t j = (log_poly_degree - 1); j != size_t(-1); j--) {
+                for (size_t j = (log_poly_degree - 1); j != static_cast<size_t>(-1); j--) {
                     auto bit = (i >> j) & 1;
                     bool b = static_cast<bool>(bit);
                     if (b) {
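On the TODO above about the O(n log n) construction of s_vec: the linear alternative it mentions can be realised without an explicit product tree by doubling the table once per round challenge. A sketch of that idea, assuming the challenges are ordered so that the first corresponds to the most significant bit of the index, and with double standing in for Fr:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Build the s-vector in linear time: each challenge doubles the table by
    // appending a scaled copy, so entry i ends up as the product of the
    // challenges selected by the bits of i (matching the per-index bit loop above).
    std::vector<double> build_s_vec(const std::vector<double>& round_challenges_inv)
    {
        std::vector<double> s{ 1.0 };
        for (double challenge : round_challenges_inv) { // most significant bit first
            std::vector<double> next(s.size() * 2);
            for (size_t i = 0; i < s.size(); i++) {
                next[2 * i] = s[i];                 // bit = 0: challenge not included
                next[2 * i + 1] = s[i] * challenge; // bit = 1: challenge included
            }
            s = std::move(next);
        }
        return s; // total work is 2 + 4 + ... + n, i.e. O(n) rather than O(n log n)
    }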
diff --git a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
index 9492475cc1c..2ff5feb85b1 100644
--- a/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/compiler_hints.hpp
@@ -23,4 +23,6 @@
 #else
 #define BB_LIKELY(x) x
 #define BB_UNLIKELY(x) x
-#endif
\ No newline at end of file
+#endif
+
+#define BB_UNUSED [[maybe_unused]]
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp b/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
index 1eb03b1ae3a..fbdd310756f 100644
--- a/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/slab_allocator.hpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <vector>
 #ifndef NO_MULTITHREADING
 #include <mutex>
 #endif
@@ -75,4 +76,9 @@ template <typename T> class ContainerSlabAllocator {
     }
 };

+/**
+ * @brief A vector that uses the slab allocator.
+ */
+template <typename T> using SlabVector = std::vector<T, ContainerSlabAllocator<T>>;
+
 } // namespace bb
\ No newline at end of file
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp
index 8aec8be2cfd..19eb38e00ab 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.cpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp
@@ -99,9 +99,9 @@ void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func)
 * @param no_multhreading_if_less_or_equal If num points is less or equal to this value, run without parallelization
 *
 */
-void run_loop_in_parallel(size_t num_points,
-                          const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal)
+void parallel_for_range(size_t num_points,
+                        const std::function<void(size_t, size_t)>& func,
+                        size_t no_multhreading_if_less_or_equal)
 {
     if (num_points <= no_multhreading_if_less_or_equal) {
         func(0, num_points);
@@ -129,45 +129,10 @@ void run_loop_in_parallel(size_t num_points,
         });
 };

-/**
- * @brief Split a loop into several loops running in parallel based on operations in 1 iteration
- *
- * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
- * that should contain the work loop, but only if it's worth it
- * @param num_points Total number of elements
- * @param func A function or lambda expression with a for loop inside, for example:
- * [](size_t start, size_t end){for (size_t i=start; i<end; i++){...}}
- */
-template <typename FunctionType>
-    requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
-             std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
-void run_loop_in_parallel_if_effective_internal(size_t num_points,
-                                                const FunctionType& func,
-                                                size_t finite_field_additions_per_iteration,
-                                                size_t finite_field_multiplications_per_iteration,
-                                                size_t finite_field_inversions_per_iteration,
-                                                size_t group_element_additions_per_iteration,
-                                                size_t group_element_doublings_per_iteration,
-                                                size_t scalar_multiplications_per_iteration,
-                                                size_t sequential_copy_ops_per_iteration)
+void parallel_for_heuristic(size_t num_points,
+                            const std::function<void(size_t, size_t, size_t)>& func,
+                            size_t heuristic_cost)
 {
-    // Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds):
-    constexpr size_t FF_ADDITION_COST = 4;
-    constexpr size_t FF_MULTIPLICATION_COST = 21;
-    constexpr size_t FF_INVERSION_COST = 7000;
-    constexpr size_t GE_ADDITION_COST = 350;
-    constexpr size_t GE_DOUBLING_COST = 194;
-    constexpr size_t SM_COST = 50000;
-    constexpr size_t SEQ_COPY_COST = 3;
     // We take the maximum observed parallel_for cost (388 us) and round it up.
     // The goal of these checks is to evade significantly (10x) increasing processing time for small workloads. So we
     // can accept not triggering parallel_for if the workload would become faster by half a millisecond for medium
@@ -180,23 +145,11 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
     const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
     // Compute the cost of all operations done by other threads
-    const size_t offset_cost =
-        (num_points - chunk_size) *
-        (finite_field_additions_per_iteration * FF_ADDITION_COST +
-         finite_field_multiplications_per_iteration * FF_MULTIPLICATION_COST +
-         finite_field_inversions_per_iteration * FF_INVERSION_COST +
-         group_element_additions_per_iteration * GE_ADDITION_COST +
-         group_element_doublings_per_iteration * GE_DOUBLING_COST + scalar_multiplications_per_iteration * SM_COST +
-         sequential_copy_ops_per_iteration * SEQ_COPY_COST);
+    const size_t offset_cost = (num_points - chunk_size) * heuristic_cost;

     // If starting parallel for is longer than computing, just compute
     if (offset_cost < PARALLEL_FOR_COST) {
-        if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {
-
-            func(0, num_points);
-        } else {
-            func(0, num_points, 0);
-        }
+        func(0, num_points, 0);
         return;
     }
     // Parallelize over chunks
@@ -213,18 +166,9 @@ void run_loop_in_parallel_if_effective_internal(size_t num_points,
         size_t start = chunk_index * chunk_size;
         size_t end = chunk_index * chunk_size + current_chunk_size;
-        if constexpr (std::is_same_v<FunctionType, std::function<void(size_t, size_t)>>) {
-
-            func(start, end);
-        } else {
-            func(start, end, chunk_index);
-        }
+        func(start, end, chunk_index);
     });
 };
-template void run_loop_in_parallel_if_effective_internal(
-    size_t, const std::function<void(size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
-template void run_loop_in_parallel_if_effective_internal(
-    size_t, const std::function<void(size_t, size_t, size_t)>&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);

 /**
  * @brief calculates number of threads to create based on minimum iterations per thread
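The rewritten dispatcher is easy to reason about in isolation: the caller pre-folds its per-iteration operation mix into a single nanosecond estimate, and the function parallelises only when the work that would be handed to other threads outweighs the fixed cost of waking the thread pool. A standalone sketch of that decision rule follows; PARALLEL_FOR_COST is referenced but not defined in this diff, so the 400000 below is an assumed value based on the "388 us rounded up" comment:

    #include <cstddef>

    // Assumed constant: the diff uses PARALLEL_FOR_COST without showing its
    // definition; 400000 ns matches "maximum observed parallel_for cost (388 us)"
    // rounded up.
    constexpr size_t PARALLEL_FOR_COST = 400000;

    // Parallelise only if the iterations that would run on *other* threads cost
    // more than starting parallel_for. heuristic_cost_ns is the caller's estimate
    // for one iteration, e.g. FF_ADDITION_COST * 2 + FF_MULTIPLICATION_COST * 2.
    bool worth_parallelising(size_t num_points, size_t heuristic_cost_ns, size_t num_cpus)
    {
        const size_t chunk_size = (num_points / num_cpus) + (num_points % num_cpus == 0 ? 0 : 1);
        const size_t offset_cost = (num_points - chunk_size) * heuristic_cost_ns;
        return offset_cost >= PARALLEL_FOR_COST;
    }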
diff --git a/barretenberg/cpp/src/barretenberg/common/thread.hpp b/barretenberg/cpp/src/barretenberg/common/thread.hpp
index 98ade3f4593..7227e139d00 100644
--- a/barretenberg/cpp/src/barretenberg/common/thread.hpp
+++ b/barretenberg/cpp/src/barretenberg/common/thread.hpp
@@ -1,4 +1,5 @@
 #pragma once
+#include "barretenberg/common/compiler_hints.hpp"
 #include
 #include
 #include
@@ -24,75 +25,39 @@ inline size_t get_num_cpus_pow2()
 * @param num_iterations Number of iterations
 * @param func Function to run in parallel
 * Observe that num_iterations is NOT the thread pool size.
- * The size will be chosen based on the hardware concurrency (i.e., env or cpus)..
+ * The size will be chosen based on the hardware concurrency (i.e., env or cpus).
 */
 void parallel_for(size_t num_iterations, const std::function<void(size_t)>& func);

-void run_loop_in_parallel(size_t num_points,
-                          const std::function<void(size_t, size_t)>& func,
-                          size_t no_multhreading_if_less_or_equal = 0);
+void parallel_for_range(size_t num_points,
+                        const std::function<void(size_t, size_t)>& func,
+                        size_t no_multhreading_if_less_or_equal = 0);

-template <typename FunctionType>
-    requires(std::is_same_v<FunctionType, std::function<void(size_t, size_t)>> ||
-             std::is_same_v<FunctionType, std::function<void(size_t, size_t, size_t)>>)
-void run_loop_in_parallel_if_effective_internal(
-    size_t, const FunctionType&, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
 /**
- * @brief Runs loop in parallel if parallelization is useful (costs less than the algorithm)
- *
- * @details Please see run_loop_in_parallel_if_effective_internal for detailed description
- *
- */
-inline void run_loop_in_parallel_if_effective(size_t num_points,
-                                              const std::function<void(size_t, size_t)>& func,
-                                              size_t finite_field_additions_per_iteration = 0,
-                                              size_t finite_field_multiplications_per_iteration = 0,
-                                              size_t finite_field_inversions_per_iteration = 0,
-                                              size_t group_element_additions_per_iteration = 0,
-                                              size_t group_element_doublings_per_iteration = 0,
-                                              size_t scalar_multiplications_per_iteration = 0,
-                                              size_t sequential_copy_ops_per_iteration = 0
-
-)
-{
-    run_loop_in_parallel_if_effective_internal(num_points,
-                                               func,
-                                               finite_field_additions_per_iteration,
-                                               finite_field_multiplications_per_iteration,
-                                               finite_field_inversions_per_iteration,
-                                               group_element_additions_per_iteration,
-                                               group_element_doublings_per_iteration,
-                                               scalar_multiplications_per_iteration,
-                                               sequential_copy_ops_per_iteration);
-}
-
-/**
- * @brief Runs loop in parallel if parallelization is useful (costs less than the algorithm). The loop function is
- * given the index of the workload.
- *
- * @details Please see run_loop_in_parallel_if_effective_internal for detailed description
+ * @brief Split a loop into several loops running in parallel based on operations in 1 iteration
 *
+ * @details Splits the num_points into appropriate number of chunks to do parallel processing on and calls the function
+ * that should contain the work loop, but only if it's worth it
+ * @param num_points Total number of elements
+ * @param func A function or lambda expression with a for loop inside, for example:
+ * [&](size_t start, size_t end, size_t thread_index){for (size_t i=start; i<end; i++){...}}
 */
-inline void run_loop_in_parallel_if_effective_with_index(size_t num_points,
-                                                         const std::function<void(size_t, size_t, size_t)>& func,
-                                                         size_t finite_field_additions_per_iteration = 0,
-                                                         size_t finite_field_multiplications_per_iteration = 0,
-                                                         size_t finite_field_inversions_per_iteration = 0,
-                                                         size_t group_element_additions_per_iteration = 0,
-                                                         size_t group_element_doublings_per_iteration = 0,
-                                                         size_t scalar_multiplications_per_iteration = 0,
-                                                         size_t sequential_copy_ops_per_iteration = 0
+void parallel_for_heuristic(size_t num_points,
+                            const std::function<void(size_t, size_t, size_t)>& func,
+                            size_t heuristic_cost);

-)
-{
-    run_loop_in_parallel_if_effective_internal(num_points,
-                                               func,
-                                               finite_field_additions_per_iteration,
-                                               finite_field_multiplications_per_iteration,
-                                               finite_field_inversions_per_iteration,
-                                               group_element_additions_per_iteration,
-                                               group_element_doublings_per_iteration,
-                                               scalar_multiplications_per_iteration,
-                                               sequential_copy_ops_per_iteration);
+template <typename Func>
+    requires std::invocable<Func, size_t>
+void parallel_for_heuristic(size_t num_points, const Func& func, size_t heuristic_cost)
+{
+    parallel_for_heuristic(
+        num_points,
+        [&](size_t start_idx, size_t end_idx, BB_UNUSED size_t chunk_index) {
+            for (size_t i = start_idx; i < end_idx; i++) {
+                func(i);
+            }
+        },
+        heuristic_cost);
 }

 const size_t DEFAULT_MIN_ITERS_PER_THREAD = 1 << 4;
@@ -119,4 +84,22 @@ size_t calculate_num_threads(size_t num_iterations, size_t min_iterations_per_thread)
 size_t calculate_num_threads_pow2(size_t num_iterations,
                                   size_t min_iterations_per_thread = DEFAULT_MIN_ITERS_PER_THREAD);

+namespace thread_heuristics {
+// Rough cost of operations (the operation costs are derived in basics_bench and the units are nanoseconds)
+// Field element (16 byte) addition cost
+constexpr size_t FF_ADDITION_COST = 4;
+// Field element (16 byte) multiplication cost
+constexpr size_t FF_MULTIPLICATION_COST = 21;
+// Field element (16 byte) inversion cost
+constexpr size_t FF_INVERSION_COST = 7000;
+// Group element projective addition cost
+constexpr size_t GE_ADDITION_COST = 350;
+// Group element projective doubling cost
+constexpr size_t GE_DOUBLING_COST = 194;
+// Group element scalar multiplication cost
+constexpr size_t SM_COST = 50000;
+// Field element (16 byte) sequential copy cost
+constexpr size_t FF_COPY_COST = 3;
+} // namespace thread_heuristics
+
 } // namespace bb
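With the header in place, a migrated call site reduces to one lambda and one cost expression. The following is a hypothetical example (not taken from the diff) using the single-index overload, which wraps func(i) in a (start, end, chunk_index) range loop before dispatching, with a composed cost of one multiplication plus one copy per element:

    #include "barretenberg/common/thread.hpp"

    #include <cstddef>
    #include <vector>

    // Hypothetical call site. The heuristic cost is the sum of the per-iteration
    // operations: one field multiplication and one copy.
    template <typename Fr> void scale_into(std::vector<Fr>& out, const std::vector<Fr>& in, const Fr& scalar)
    {
        bb::parallel_for_heuristic(
            in.size(),
            [&](size_t i) { out[i] = in[i] * scalar; },
            bb::thread_heuristics::FF_MULTIPLICATION_COST + bb::thread_heuristics::FF_COPY_COST);
    }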
diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp
index 81773ed655f..06aec65a0ff 100644
--- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp
+++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/acir_format.hpp
@@ -102,12 +102,8 @@ struct AcirFormat {
     // A standard plonk arithmetic constraint, as defined in the poly_triple struct, consists of selector values
     // for q_M,q_L,q_R,q_O,q_C and indices of three variables taking the role of left, right and output wire
     // This could be a large vector so use slab allocator, we don't expect the blackbox implementations to be so large.
-    std::vector<bb::poly_triple_<bb::curve::BN254::ScalarField>,
-                bb::ContainerSlabAllocator<bb::poly_triple_<bb::curve::BN254::ScalarField>>>
-        poly_triple_constraints;
-    std::vector<bb::mul_quad_<bb::curve::BN254::ScalarField>,
-                bb::ContainerSlabAllocator<bb::mul_quad_<bb::curve::BN254::ScalarField>>>
-        quad_constraints;
+    bb::SlabVector<bb::poly_triple_<bb::curve::BN254::ScalarField>> poly_triple_constraints;
+    bb::SlabVector<bb::mul_quad_<bb::curve::BN254::ScalarField>> quad_constraints;
     std::vector<BlockConstraint> block_constraints;

     // Number of gates added to the circuit per original opcode.
@@ -148,7 +144,7 @@ struct AcirFormat {
     friend bool operator==(AcirFormat const& lhs, AcirFormat const& rhs) = default;
 };

-using WitnessVector = std::vector<bb::fr, bb::ContainerSlabAllocator<bb::fr>>;
+using WitnessVector = bb::SlabVector<bb::fr>;
 using WitnessVectorStack = std::vector<std::pair<uint32_t, WitnessVector>>;

 struct AcirProgram {
diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp
index 86ac113e76d..dc3e1222f7e 100644
--- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/honk_recursion_constraint.test.cpp
@@ -141,7 +141,7 @@ class AcirHonkRecursionConstraint : public ::testing::Test {
         std::vector<RecursionConstraint> honk_recursion_constraints;

         size_t witness_offset = 0;
-        std::vector<fr, ContainerSlabAllocator<fr>> witness;
+        SlabVector<fr> witness;

         for (auto& inner_circuit : inner_circuits) {
diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp b/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp
index 95b650f13d3..2e6384e13c2 100644
--- a/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/dsl/acir_format/recursion_constraint.test.cpp
@@ -139,7 +139,7 @@ Builder create_outer_circuit(std::vector<Builder>& inner_circuits)
     std::vector<RecursionConstraint> recursion_constraints;

     size_t witness_offset = 0;
-    std::vector<fr, ContainerSlabAllocator<fr>> witness;
+    SlabVector<fr> witness;

     for (auto& inner_circuit : inner_circuits) {
         auto inner_composer = Composer();
diff --git a/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp b/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp
index f53011ec374..f222381c429 100644
--- a/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp
+++ b/barretenberg/cpp/src/barretenberg/dsl/acir_proofs/acir_composer.hpp
@@ -11,7 +11,7 @@ namespace acir_proofs {
 */
 class AcirComposer {

-    using WitnessVector = std::vector<bb::fr, bb::ContainerSlabAllocator<bb::fr>>;
+    using WitnessVector = bb::SlabVector<bb::fr>;

  public:
    AcirComposer(size_t size_hint = 0, bool verbose = true);
diff --git a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
index 19bc945e476..6630406b89a 100644
--- a/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/ecc/groups/element_impl.hpp
@@ -728,20 +728,8 @@ void element<Fq, Fr, T>::batch_affine_add(const std::span<affine_element> first_group,
     std::vector<Fq> scratch_space(num_points);

-    run_loop_in_parallel_if_effective(
-        num_points,
-        [&results, &first_group](size_t start, size_t end) {
-            for (size_t i = start; i < end; i++) {
-                results[i] = first_group[i];
-            }
-        },
-        /*finite_field_additions_per_iteration=*/0,
-        /*finite_field_multiplications_per_iteration=*/0,
-        /*finite_field_inversions_per_iteration=*/0,
-        /*group_element_additions_per_iteration=*/0,
-        /*group_element_doublings_per_iteration=*/0,
-        /*scalar_multiplications_per_iteration=*/0,
-        /*sequential_copy_ops_per_iteration=*/2);
+    parallel_for_heuristic(
+        num_points, [&](size_t i) { results[i] = first_group[i]; }, thread_heuristics::FF_COPY_COST * 2);

     // TODO(#826): Same code as in batch mul
     // we can mutate rhs but NOT lhs!
@@ -779,16 +767,14 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
     */
    const auto batch_affine_add_internal =
        [num_points, &scratch_space, &batch_affine_add_chunked](const affine_element* lhs, affine_element* rhs) {
-            run_loop_in_parallel_if_effective(
+            parallel_for_heuristic(
                num_points,
-                [lhs, &rhs, &scratch_space, &batch_affine_add_chunked](size_t start, size_t end) {
+                [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                    batch_affine_add_chunked(lhs + start, rhs + start, end - start, &scratch_space[0] + start);
                },
-                /*finite_field_additions_per_iteration=*/6,
-                /*finite_field_multiplications_per_iteration=*/6);
+                thread_heuristics::FF_ADDITION_COST * 6 + thread_heuristics::FF_MULTIPLICATION_COST * 6);
        };

    /**
@@ -896,13 +881,12 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
     */
    const auto batch_affine_double = [num_points, &scratch_space, &batch_affine_double_chunked](affine_element* lhs) {
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [&lhs, &scratch_space, &batch_affine_double_chunked](size_t start, size_t end) {
+            [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
                batch_affine_double_chunked(lhs + start, end - start, &scratch_space[0] + start);
            },
-            /*finite_field_additions_per_iteration=*/7,
-            /*finite_field_multiplications_per_iteration=*/6);
+            thread_heuristics::FF_ADDITION_COST * 7 + thread_heuristics::FF_MULTIPLICATION_COST * 6);
    };
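For context on the scratch_space buffer threaded through batch_affine_add_chunked and batch_affine_double_chunked: affine point addition costs one field inversion per point, and the chunked helpers amortise this with Montgomery's batch-inversion trick, turning n inversions into a single inversion plus roughly 3n multiplications. A generic sketch of the trick itself, with double standing in for Fq (all inputs must be nonzero):

    #include <cstddef>
    #include <vector>

    // Montgomery batch inversion: scratch[i] stores the prefix product of
    // vals[0..i), mirroring the role of the scratch_space buffer above. One real
    // division is performed, then inverses are peeled off back-to-front.
    void batch_invert(std::vector<double>& vals, std::vector<double>& scratch)
    {
        const size_t n = vals.size();
        double acc = 1.0;
        for (size_t i = 0; i < n; i++) {
            scratch[i] = acc; // product of vals[0..i)
            acc *= vals[i];
        }
        double inv = 1.0 / acc; // the only true inversion
        for (size_t i = n; i-- > 0;) {
            const double tmp = vals[i];
            vals[i] = inv * scratch[i]; // = 1 / vals[i]
            inv *= tmp;                 // now 1 / (vals[0] * ... * vals[i-1])
        }
    }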
    // We compute the resulting point through WNAF by evaluating (the (\sum_i (16ⁱ⋅
@@ -912,22 +896,9 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
    // computing p⋅Point, we get a point at infinity, which is an edgecase, and we don't want to handle edgecases in the
    // hot loop since they slow the computation down. So it's better to just handle it here.
    if (scalar == -Fr::one()) {
        std::vector<affine_element> results(num_points);
-        run_loop_in_parallel_if_effective(
-            num_points,
-            [&results, &points](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-                    results[i] = -points[i];
-                }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+        parallel_for_heuristic(
+            num_points, [&](size_t i) { results[i] = -points[i]; }, thread_heuristics::FF_COPY_COST);
        return results;
    }
    // Compute wnaf for scalar
@@ -938,20 +909,8 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
        affine_element result{ Fq::zero(), Fq::zero() };
        result.self_set_infinity();
        std::vector<affine_element> results(num_points);
-        run_loop_in_parallel_if_effective(
-            num_points,
-            [&results, result](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-                    results[i] = result;
-                }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+        parallel_for_heuristic(
+            num_points, [&](size_t i) { results[i] = result; }, thread_heuristics::FF_COPY_COST);
        return results;
    }
@@ -963,41 +922,23 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
    }
    // Initialize first entries in lookup table
    std::vector<affine_element> temp_point_vector(num_points);
-    run_loop_in_parallel_if_effective(
+    parallel_for_heuristic(
        num_points,
-        [&temp_point_vector, &lookup_table, &points](size_t start, size_t end) {
-            for (size_t i = start; i < end; ++i) {
-                // If the point is at infinity we fix-up the result later
-                // To avoid 'trying to invert zero in the field' we set the point to 'one' here
-                temp_point_vector[i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i];
-                lookup_table[0][i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i];
-            }
+        [&](size_t i) {
+            // If the point is at infinity we fix-up the result later
+            // To avoid 'trying to invert zero in the field' we set the point to 'one' here
+            temp_point_vector[i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i];
+            lookup_table[0][i] = points[i].is_point_at_infinity() ? affine_element::one() : points[i];
        },
-        /*finite_field_additions_per_iteration=*/0,
-        /*finite_field_multiplications_per_iteration=*/0,
-        /*finite_field_inversions_per_iteration=*/0,
-        /*group_element_additions_per_iteration=*/0,
-        /*group_element_doublings_per_iteration=*/0,
-        /*scalar_multiplications_per_iteration=*/0,
-        /*sequential_copy_ops_per_iteration=*/2);
+        thread_heuristics::FF_COPY_COST * 2);

    // Construct lookup table
    batch_affine_double(&temp_point_vector[0]);
    for (size_t j = 1; j < LOOKUP_SIZE; ++j) {
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [j, &lookup_table](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-                    lookup_table[j][i] = lookup_table[j - 1][i];
-                }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            [&](size_t i) { lookup_table[j][i] = lookup_table[j - 1][i]; },
+            thread_heuristics::FF_COPY_COST);
        batch_affine_add_internal(&temp_point_vector[0], &lookup_table[j][0]);
    }
@@ -1016,31 +957,22 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
        index = wnaf_entry & 0x0fffffffU;
        sign = static_cast<bool>((wnaf_entry >> 31) & 1);
        const bool is_odd = ((j & 1) == 1);
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [j, index, is_odd, sign, beta, &lookup_table, &work_elements, &temp_point_vector](size_t start,
-                                                                                              size_t end) {
-                for (size_t i = start; i < end; ++i) {
-
-                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
-                    to_add.y.self_conditional_negate(sign ^ is_odd);
-                    if (is_odd) {
-                        to_add.x *= beta;
-                    }
-                    if (j == 0) {
-                        work_elements[i] = to_add;
-                    } else {
-                        temp_point_vector[i] = to_add;
-                    }
-                }
+            [&](size_t i) {
+                auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                to_add.y.self_conditional_negate(sign ^ is_odd);
+                if (is_odd) {
+                    to_add.x *= beta;
+                }
+                if (j == 0) {
+                    work_elements[i] = to_add;
+                } else {
+                    temp_point_vector[i] = to_add;
+                }
            },
-            /*finite_field_additions_per_iteration=*/1,
-            /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            (is_odd ? thread_heuristics::FF_MULTIPLICATION_COST : 0) + thread_heuristics::FF_COPY_COST +
+                thread_heuristics::FF_ADDITION_COST);
    }
    // First cycle of addition
    batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]);
@@ -1055,83 +987,47 @@ std::vector<typename element<Fq, Fr, T>::affine_element> element<Fq, Fr, T>::batch_mul_with_endomorphism(
                batch_affine_double(&work_elements[0]);
            }
        }
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [index, is_odd, sign, beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-
-                    auto to_add = lookup_table[static_cast<size_t>(index)][i];
-                    to_add.y.self_conditional_negate(sign ^ is_odd);
-                    if (is_odd) {
-                        to_add.x *= beta;
-                    }
-                    temp_point_vector[i] = to_add;
                }
+            [&](size_t i) {
+                auto to_add = lookup_table[static_cast<size_t>(index)][i];
+                to_add.y.self_conditional_negate(sign ^ is_odd);
+                if (is_odd) {
+                    to_add.x *= beta;
                }
+                temp_point_vector[i] = to_add;
            },
-            /*finite_field_additions_per_iteration=*/1,
-            /*finite_field_multiplications_per_iteration=*/is_odd ? 1 : 0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            (is_odd ? thread_heuristics::FF_MULTIPLICATION_COST : 0) + thread_heuristics::FF_COPY_COST +
+                thread_heuristics::FF_ADDITION_COST);
        batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]);
    }

    // Apply skew for the first endo scalar
    if (wnaf.skew) {
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [&lookup_table, &temp_point_vector](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-
-                    temp_point_vector[i] = -lookup_table[0][i];
-                }
-            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/0,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            [&](size_t i) { temp_point_vector[i] = -lookup_table[0][i]; },
+            thread_heuristics::FF_ADDITION_COST + thread_heuristics::FF_COPY_COST);
        batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]);
    }
    // Apply skew for the second endo scalar
    if (wnaf.endo_skew) {
-        run_loop_in_parallel_if_effective(
+        parallel_for_heuristic(
            num_points,
-            [beta, &lookup_table, &temp_point_vector](size_t start, size_t end) {
-                for (size_t i = start; i < end; ++i) {
-                    temp_point_vector[i] = lookup_table[0][i];
-                    temp_point_vector[i].x *= beta;
-                }
+            [&](size_t i) {
+                temp_point_vector[i] = lookup_table[0][i];
+                temp_point_vector[i].x *= beta;
            },
-            /*finite_field_additions_per_iteration=*/0,
-            /*finite_field_multiplications_per_iteration=*/1,
-            /*finite_field_inversions_per_iteration=*/0,
-            /*group_element_additions_per_iteration=*/0,
-            /*group_element_doublings_per_iteration=*/0,
-            /*scalar_multiplications_per_iteration=*/0,
-            /*sequential_copy_ops_per_iteration=*/1);
+            thread_heuristics::FF_MULTIPLICATION_COST + thread_heuristics::FF_COPY_COST);
        batch_affine_add_internal(&temp_point_vector[0], &work_elements[0]);
    }
    // handle points at infinity explicitly
-    run_loop_in_parallel_if_effective(
+    parallel_for_heuristic(
        num_points,
-        [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; ++i) {
-                work_elements[i] =
-                    points[i].is_point_at_infinity() ? work_elements[i].set_infinity() : work_elements[i];
-            }
-        },
-        /*finite_field_additions_per_iteration=*/0,
-        /*finite_field_multiplications_per_iteration=*/1,
-        /*finite_field_inversions_per_iteration=*/0,
-        /*group_element_additions_per_iteration=*/0,
-        /*group_element_doublings_per_iteration=*/0,
-        /*scalar_multiplications_per_iteration=*/0,
-        /*sequential_copy_ops_per_iteration=*/1);
+        [&](size_t i) {
+            work_elements[i] = points[i].is_point_at_infinity() ? work_elements[i].set_infinity() : work_elements[i];
+        },
+        thread_heuristics::FF_COPY_COST);

    return work_elements;
}
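The hunks below are mechanical renames: run_loop_in_parallel becomes parallel_for_range with the same (start, end) contract and the same optional threshold under which the loop runs sequentially. For reference, a hypothetical call site with the threshold spelled out:

    #include "barretenberg/common/thread.hpp"

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical call: same shape as the call sites below, with the optional
    // third argument forcing a sequential run for 64 or fewer elements.
    void square_all(std::vector<uint64_t>& values)
    {
        bb::parallel_for_range(
            values.size(),
            [&](size_t start, size_t end) {
                for (size_t i = start; i < end; i++) {
                    values[i] *= values[i];
                }
            },
            /*no_multhreading_if_less_or_equal=*/64);
    }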
diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp
index 5d560639642..6121469064a 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_circuit_builder.hpp
@@ -144,7 +144,7 @@ class ECCVMCircuitBuilder {
             msm.resize(msm_sizes[i]);
         }

-        run_loop_in_parallel(msm_opqueue_index.size(), [&](size_t start, size_t end) {
+        parallel_for_range(msm_opqueue_index.size(), [&](size_t start, size_t end) {
             for (size_t i = start; i < end; i++) {
                 const auto& op = raw_ops[msm_opqueue_index[i]];
                 auto [msm_index, mul_index] = msm_mul_index[i];
diff --git a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp
index 89b3b5a1593..6affa954598 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/eccvm_flavor.hpp
@@ -524,7 +524,7 @@ class ECCVMFlavor {
         }

         // compute polynomials for transcript columns
-        run_loop_in_parallel(transcript_rows.size(), [&](size_t start, size_t end) {
+        parallel_for_range(transcript_rows.size(), [&](size_t start, size_t end) {
             for (size_t i = start; i < end; i++) {
                 transcript_accumulator_empty[i] = transcript_rows[i].accumulator_empty;
                 transcript_add[i] = transcript_rows[i].q_add;
@@ -575,7 +575,7 @@ class ECCVMFlavor {
                 transcript_accumulator_y[i] = transcript_accumulator_y[i - 1];
             }

-        run_loop_in_parallel(point_table_rows.size(), [&](size_t start, size_t end) {
+        parallel_for_range(point_table_rows.size(), [&](size_t start, size_t end) {
             for (size_t i = start; i < end; i++) {
                 // first row is always an empty row (to accommodate shifted polynomials which must have 0 as 1st
                 // coefficient). All other rows in the point_table_rows represent active wnaf gates (i.e.
@@ -605,7 +605,7 @@ class ECCVMFlavor {
         });

         // compute polynomials for the msm columns
-        run_loop_in_parallel(msm_rows.size(), [&](size_t start, size_t end) {
+        parallel_for_range(msm_rows.size(), [&](size_t start, size_t end) {
             for (size_t i = start; i < end; i++) {
                 msm_transition[i] = static_cast<FF>(msm_rows[i].msm_transition);
                 msm_add[i] = static_cast<FF>(msm_rows[i].q_add);
diff --git a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
index 006e21b7685..d0bb9459ad2 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
@@ -341,13 +341,13 @@ class ECCVMMSMMBuilder {
         }

         // Normalize the points in the point trace
-        run_loop_in_parallel(points_to_normalize.size(), [&](size_t start, size_t end) {
+        parallel_for_range(points_to_normalize.size(), [&](size_t start, size_t end) {
             Element::batch_normalize(&points_to_normalize[start], end - start);
         });

         // inverse_trace is used to compute the value of the `collision_inverse` column in the ECCVM.
         std::vector<FF> inverse_trace(num_point_adds_and_doubles);
-        run_loop_in_parallel(num_point_adds_and_doubles, [&](size_t start, size_t end) {
+        parallel_for_range(num_point_adds_and_doubles, [&](size_t start, size_t end) {
             for (size_t operation_idx = start; operation_idx < end; ++operation_idx) {
                 if (operation_trace[operation_idx]) {
                     inverse_trace[operation_idx] = (p1_trace[operation_idx].y + p1_trace[operation_idx].y);
diff --git a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp
index c98e1d56b8b..614eea69d34 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/precomputed_tables_builder.hpp
@@ -46,7 +46,7 @@ class ECCVMPointTablePrecomputationBuilder {
         // current impl doesn't work if not 4
         static_assert(WNAF_DIGITS_PER_ROW == 4);

-        run_loop_in_parallel(ecc_muls.size(), [&](size_t start, size_t end) {
+        parallel_for_range(ecc_muls.size(), [&](size_t start, size_t end) {
             for (size_t j = start; j < end; j++) {
                 const auto& entry = ecc_muls[j];
                 const auto& slices = entry.wnaf_digits;
diff --git a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp
index 39ddf504706..7c375848f2a 100644
--- a/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp
+++ b/barretenberg/cpp/src/barretenberg/execution_trace/execution_trace.hpp
@@ -10,7 +10,7 @@ template <class Flavor> class ExecutionTrace_ {
     using Polynomial = typename Flavor::Polynomial;
     using FF = typename Flavor::FF;
     using TrackBlocks = typename Builder::Arithmetization::TraceBlocks;
-    using Wires = std::array<std::vector<uint32_t, ContainerSlabAllocator<uint32_t>>, Builder::NUM_WIRES>;
+    using Wires = std::array<SlabVector<uint32_t>, Builder::NUM_WIRES>;
     using ProvingKey = typename Flavor::ProvingKey;

   public:
diff --git a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp
index bb5d7cf5d3c..f49b675c45b 100644
--- a/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp
+++ b/barretenberg/cpp/src/barretenberg/grumpkin_srs_gen/grumpkin_srs_gen.cpp
@@ -40,7 +40,7 @@ int main(int argc, char** argv)
 #ifndef NO_MULTITHREADING
     std::mutex vector_access_mutex;
 #endif
-    run_loop_in_parallel(subgroup_size, [&](size_t start, size_t end) {
+    parallel_for_range(subgroup_size, [&](size_t start, size_t end) {
         std::vector<uint8_t> hash_input;
         for (size_t point_idx = start; point_idx < end; ++point_idx) {
             bool rational_point_found = false;
diff --git a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp
index 14789bfa263..90351184809 100644
--- a/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp
+++ b/barretenberg/cpp/src/barretenberg/plonk_honk_shared/arithmetization/arithmetization.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include "barretenberg/common/ref_array.hpp"
+#include "barretenberg/common/slab_allocator.hpp"
 #include

 #ifdef CHECK_CIRCUIT_STACKTRACES
@@ -39,8 +40,8 @@ enum class TraceStructure { NONE, SMALL_TEST, CLIENT_IVC_BENCH, AZTEC_IVC_BENCH,
 */
 template <typename FF_, size_t NUM_WIRES, size_t NUM_SELECTORS> class ExecutionTraceBlock {
   public:
-    using SelectorType = std::vector<FF_, ContainerSlabAllocator<FF_>>;
-    using WireType = std::vector<uint32_t, ContainerSlabAllocator<uint32_t>>;
+    using SelectorType = SlabVector<FF_>;
+    using WireType = SlabVector<uint32_t>;
     using Selectors = std::array<SelectorType, NUM_SELECTORS>;
     using Wires = std::array<WireType, NUM_WIRES>;
diff --git a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp
index 17d50d6238f..37fa0c619e3 100644
--- a/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp
+++ b/barretenberg/cpp/src/barretenberg/protogalaxy/protogalaxy_prover_impl.hpp
@@ -17,7 +17,7 @@ std::vector<typename ProverInstances::FF> ProtoGalaxyProver_<ProverInstances>::compute_full_honk_evaluations(
     std::mutex evaluation_mutex;
 #endif
     auto linearly_dependent_contribution_accumulator = FF(0);
-    run_loop_in_parallel(instance_size, [&](size_t start_row, size_t end_row) {
+    parallel_for_range(instance_size, [&](size_t start_row, size_t end_row) {
         auto thread_accumulator = FF(0);
         for (size_t row = start_row; row < end_row; row++) {
             auto row_evaluations = instance_polynomials.get_row(row);
@@ -60,7 +60,7 @@ std::vector<typename ProverInstances::FF> ProtoGalaxyProver_<ProverInstances>::construct_coefficients_tree(
         auto degree = level + 1;
         auto prev_level_width = prev_level_coeffs.size();
         std::vector<std::vector<FF>> level_coeffs(prev_level_width >> 1, std::vector<FF>(degree + 1, 0));
-        run_loop_in_parallel(
+        parallel_for_range(
             prev_level_width >> 1,
             [&](size_t start, size_t end) {
                 for (size_t node = start << 1; node < end << 1; node += 2) {
@@ -85,7 +85,7 @@ std::vector<typename ProverInstances::FF> ProtoGalaxyProver_<ProverInstances>::construct_perturbator_coefficients(
 {
     auto width = full_honk_evaluations.size();
     std::vector<std::vector<FF>> first_level_coeffs(width >> 1, std::vector<FF>(2, 0));
-    run_loop_in_parallel(width >> 1, [&](size_t start, size_t end) {
+    parallel_for_range(width >> 1, [&](size_t start, size_t end) {
         for (size_t node = start << 1; node < end << 1; node += 2) {
             auto parent = node >> 1;
             first_level_coeffs[parent][0] = full_honk_evaluations[node] + full_honk_evaluations[node + 1] * betas[0];
diff --git a/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp b/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp
index d72cb072bf3..9b9f6b9de30 100644
--- a/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/translator_vm/translator_circuit_builder.hpp
@@ -322,7 +322,7 @@ class TranslatorCircuitBuilder : public CircuitBuilderBase {
     // The input we evaluate polynomials on
     Fq evaluation_input_x;

-    std::array<std::vector<uint32_t, ContainerSlabAllocator<uint32_t>>, NUM_WIRES> wires;
+    std::array<SlabVector<uint32_t>, NUM_WIRES> wires;

     /**
      * @brief Construct a new Translator Circuit Builder object
diff --git a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp
index 560b208f605..2bc903a4592 100644
--- a/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp
+++ b/barretenberg/cpp/src/barretenberg/translator_vm/translator_prover.cpp
@@ -34,7 +34,7 @@ void TranslatorProver::compute_witness(CircuitBuilder& circuit_builder)
     // Populate the wire polynomials from the wire vectors in the circuit constructor. Note: In goblin translator wires
     // come as is, since they have to reflect the structure of polynomials in the first 4 wires, which we've committed to
     for (auto [wire_poly, wire] : zip_view(key->polynomials.get_wires(), circuit_builder.wires)) {
-        run_loop_in_parallel(circuit_builder.num_gates, [&](size_t start, size_t end) {
+        parallel_for_range(circuit_builder.num_gates, [&](size_t start, size_t end) {
             for (size_t i = start; i < end; i++) {
                 wire_poly[i] = circuit_builder.get_variable(wire[i]);
             }
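Taken together, the container changes across acir_format, the execution trace, the ECCVM builders, and the translator are a pure spelling cleanup: every std::vector<T, bb::ContainerSlabAllocator<T>> becomes bb::SlabVector<T>, the alias introduced in slab_allocator.hpp, with identical allocation behaviour underneath. A before/after sketch:

    #include "barretenberg/common/slab_allocator.hpp"

    #include <cstdint>
    #include <vector>

    // Before: the slab allocator had to be spelled out at every declaration site.
    std::vector<uint32_t, bb::ContainerSlabAllocator<uint32_t>> witness_before;

    // After: same type underneath, via the alias from slab_allocator.hpp.
    bb::SlabVector<uint32_t> witness_after;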