From ad1c73bbad666cbbf3a8c8cbbd35e43450339392 Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Sun, 5 May 2024 22:05:32 +0000
Subject: [PATCH 1/6] Fix failure at cost of regression.

---
 .../src/barretenberg/eccvm/msm_builder.hpp    | 363 +++++++++---------
 .../goblin/goblin_recursion.test.cpp          |   8 +-
 2 files changed, 192 insertions(+), 179 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
index 3be74f357aa..1eff50b35e9 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
@@ -207,146 +207,154 @@ class ECCVMMSMMBuilder {
         // we start the accumulator at the point at infinity
         accumulator_trace[0] = (CycleGroup::affine_point_at_infinity);
 
+        // TODO(https://github.com/AztecProtocol/barretenberg/issues/973): Reinstate multitreading.
+        // For now, only multithread some parts of this builder for circuits that are larger than ones we see or test.
+        static constexpr uint32_t HACK_TO_FIX_VANILLA = uint32_t(1) << 31;
         // populate point trace data, and the components of the MSM execution trace that do not relate to affine point
         // operations
-        run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; i++) {
-                Element accumulator = CycleGroup::affine_point_at_infinity;
-                const auto& msm = msms[i];
-                size_t msm_row_index = msm_row_indices[i];
-                const size_t msm_size = msm.size();
-                const size_t rows_per_round =
-                    (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
-                size_t trace_index = (msm_row_indices[i] - 1) * 4;
-
-                for (size_t j = 0; j < num_rounds; ++j) {
-                    const uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
-
-                    for (size_t k = 0; k < rows_per_round; ++k) {
-                        const size_t points_per_row =
-                            (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
-                        auto& row = msm_state[msm_row_index];
-                        const size_t idx = k * ADDITIONS_PER_ROW;
-                        row.msm_transition = (j == 0) && (k == 0);
-                        for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-
-                            auto& add_state = row.add_state[m];
-                            add_state.add = points_per_row > m;
-                            int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
-                            // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
-                            // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in
-                            // the MSM columns into the MSM accumulator `add_state.slice` = A 4-bit WNAF slice of
-                            // the scalar multiplier associated with the point we are adding (the specific slice
-                            // chosen depends on the value of msm_round) (WNAF = windowed-non-adjacent-form. Value
-                            // range is `-15, -13,
-                            // ..., 15`) If `add_state.add = 1`, we want `add_state.slice` to be the *compressed*
-                            // form of the WNAF slice value. (compressed = no gaps in the value range. i.e. -15,
-                            // -13, ..., 15 maps to 0, ... , 15)
-                            add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
-                            add_state.point = add_state.add
-                                                  ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
-                                                  : AffineElement{ 0, 0 };
+        run_loop_in_parallel(
+            msms.size(),
+            [&](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    Element accumulator = CycleGroup::affine_point_at_infinity;
+                    const auto& msm = msms[i];
+                    size_t msm_row_index = msm_row_indices[i];
+                    const size_t msm_size = msm.size();
+                    const size_t rows_per_round =
+                        (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+                    size_t trace_index = (msm_row_indices[i] - 1) * 4;
+
+                    for (size_t j = 0; j < num_rounds; ++j) {
+                        const uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
 
-                            // predicate logic:
-                            // add_predicate should normally equal add_state.add
-                            // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point
-                            // addition of a new MSM In this case, we do NOT add the 1st point into the accumulator,
-                            // instead we SET the accumulator to equal the 1st point. add_predicate is used to
-                            // determine whether we add the output of a point addition into the accumulator,
-                            // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add =
-                            // true
-                            bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
-
-                            Element p1 = (m == 0) ? Element(add_state.point) : accumulator;
-                            Element p2 = (m == 0) ? accumulator : Element(add_state.point);
-
-                            accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1);
-                            p1_trace[trace_index] = p1;
-                            p2_trace[trace_index] = p2;
-                            p3_trace[trace_index] = accumulator;
-                            operation_trace[trace_index] = false;
-                            trace_index++;
-                        }
-                        accumulator_trace[msm_row_index] = accumulator;
-                        row.q_add = true;
-                        row.q_double = false;
-                        row.q_skew = false;
-                        row.msm_round = static_cast<uint32_t>(j);
-                        row.msm_size = static_cast<uint32_t>(msm_size);
-                        row.msm_count = static_cast<uint32_t>(idx);
-                        row.pc = pc;
-                        msm_row_index++;
-                    }
-                    // doubling
-                    if (j < num_rounds - 1) {
-                        auto& row = msm_state[msm_row_index];
-                        row.msm_transition = false;
-                        row.msm_round = static_cast<uint32_t>(j + 1);
-                        row.msm_size = static_cast<uint32_t>(msm_size);
-                        row.msm_count = static_cast<uint32_t>(0);
-                        row.q_add = false;
-                        row.q_double = true;
-                        row.q_skew = false;
-                        for (size_t m = 0; m < 4; ++m) {
-
-                            auto& add_state = row.add_state[m];
-                            add_state.add = false;
-                            add_state.slice = 0;
-                            add_state.point = { 0, 0 };
-                            add_state.collision_inverse = 0;
-
-                            p1_trace[trace_index] = accumulator;
-                            p2_trace[trace_index] = accumulator;
-                            accumulator = accumulator.dbl();
-                            p3_trace[trace_index] = accumulator;
-                            operation_trace[trace_index] = true;
-                            trace_index++;
-                        }
-                        accumulator_trace[msm_row_index] = accumulator;
-                        msm_row_index++;
-                    } else {
                         for (size_t k = 0; k < rows_per_round; ++k) {
-                            auto& row = msm_state[msm_row_index];
-
                             const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size
                                                               ? msm_size % ADDITIONS_PER_ROW
                                                               : ADDITIONS_PER_ROW;
+                            auto& row = msm_state[msm_row_index];
                             const size_t idx = k * ADDITIONS_PER_ROW;
-                            row.msm_transition = false;
-
-                            Element acc_expected = accumulator;
+                            row.msm_transition = (j == 0) && (k == 0);
+                            for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
 
-                            for (size_t m = 0; m < 4; ++m) {
                                 auto& add_state = row.add_state[m];
                                 add_state.add = points_per_row > m;
-                                add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
-
+                                int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
+                                // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
+                                // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in
+                                // the MSM columns into the MSM accumulator `add_state.slice` = A 4-bit WNAF slice of
+                                // the scalar multiplier associated with the point we are adding (the specific slice
+                                // chosen depends on the value of msm_round) (WNAF = windowed-non-adjacent-form. Value
+                                // range is `-15, -13,
+                                // ..., 15`) If `add_state.add = 1`, we want `add_state.slice` to be the *compressed*
+                                // form of the WNAF slice value. (compressed = no gaps in the value range. i.e. -15,
+                                // -13, ..., 15 maps to 0, ... , 15)
+                                add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
                                 add_state.point =
                                     add_state.add ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
                                                   : AffineElement{ 0, 0 };
-                                bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
-                                auto p1 = accumulator;
-                                accumulator = add_predicate ? accumulator + add_state.point : accumulator;
+
+                                // predicate logic:
+                                // add_predicate should normally equal add_state.add
+                                // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point
+                                // addition of a new MSM In this case, we do NOT add the 1st point into the accumulator,
+                                // instead we SET the accumulator to equal the 1st point. add_predicate is used to
+                                // determine whether we add the output of a point addition into the accumulator,
+                                // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add =
+                                // true
+                                bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
+
+                                Element p1 = (m == 0) ? Element(add_state.point) : accumulator;
+                                Element p2 = (m == 0) ? accumulator : Element(add_state.point);
+
+                                accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1);
                                 p1_trace[trace_index] = p1;
-                                p2_trace[trace_index] = add_state.point;
+                                p2_trace[trace_index] = p2;
                                 p3_trace[trace_index] = accumulator;
                                 operation_trace[trace_index] = false;
                                 trace_index++;
                             }
-                            row.q_add = false;
+                            accumulator_trace[msm_row_index] = accumulator;
+                            row.q_add = true;
                             row.q_double = false;
-                            row.q_skew = true;
-                            row.msm_round = static_cast<uint32_t>(j + 1);
+                            row.q_skew = false;
+                            row.msm_round = static_cast<uint32_t>(j);
                             row.msm_size = static_cast<uint32_t>(msm_size);
                             row.msm_count = static_cast<uint32_t>(idx);
                             row.pc = pc;
+                            msm_row_index++;
+                        }
+                        // doubling
+                        if (j < num_rounds - 1) {
+                            auto& row = msm_state[msm_row_index];
+                            row.msm_transition = false;
+                            row.msm_round = static_cast<uint32_t>(j + 1);
+                            row.msm_size = static_cast<uint32_t>(msm_size);
+                            row.msm_count = static_cast<uint32_t>(0);
+                            row.q_add = false;
+                            row.q_double = true;
+                            row.q_skew = false;
+                            for (size_t m = 0; m < 4; ++m) {
+
+                                auto& add_state = row.add_state[m];
+                                add_state.add = false;
+                                add_state.slice = 0;
+                                add_state.point = { 0, 0 };
+                                add_state.collision_inverse = 0;
+
+                                p1_trace[trace_index] = accumulator;
+                                p2_trace[trace_index] = accumulator;
+                                accumulator = accumulator.dbl();
+                                p3_trace[trace_index] = accumulator;
+                                operation_trace[trace_index] = true;
+                                trace_index++;
+                            }
                             accumulator_trace[msm_row_index] = accumulator;
                             msm_row_index++;
+                        } else {
+                            for (size_t k = 0; k < rows_per_round; ++k) {
+                                auto& row = msm_state[msm_row_index];
+
+                                const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size
+                                                                  ? msm_size % ADDITIONS_PER_ROW
+                                                                  : ADDITIONS_PER_ROW;
+                                const size_t idx = k * ADDITIONS_PER_ROW;
+                                row.msm_transition = false;
+
+                                Element acc_expected = accumulator;
+
+                                for (size_t m = 0; m < 4; ++m) {
+                                    auto& add_state = row.add_state[m];
+                                    add_state.add = points_per_row > m;
+                                    add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
+
+                                    add_state.point =
+                                        add_state.add
+                                            ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
+                                            : AffineElement{ 0, 0 };
+                                    bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+                                    auto p1 = accumulator;
+                                    accumulator = add_predicate ? accumulator + add_state.point : accumulator;
+                                    p1_trace[trace_index] = p1;
+                                    p2_trace[trace_index] = add_state.point;
+                                    p3_trace[trace_index] = accumulator;
+                                    operation_trace[trace_index] = false;
+                                    trace_index++;
+                                }
+                                row.q_add = false;
+                                row.q_double = false;
+                                row.q_skew = true;
+                                row.msm_round = static_cast<uint32_t>(j + 1);
+                                row.msm_size = static_cast<uint32_t>(msm_size);
+                                row.msm_count = static_cast<uint32_t>(idx);
+                                row.pc = pc;
+                                accumulator_trace[msm_row_index] = accumulator;
+                                msm_row_index++;
+                            }
                         }
                     }
                 }
-            }
-        });
+            },
+            HACK_TO_FIX_VANILLA);
 
         // Normalize the points in the point trace
         run_loop_in_parallel(point_trace.size(), [&](size_t start, size_t end) {
@@ -369,67 +377,24 @@ class ECCVMMSMMBuilder {
         // complete the computation of the ECCVM execution trace, by adding the affine intermediate point data
         // i.e. row.accumulator_x, row.accumulator_y, row.add_state[0...3].collision_inverse,
         // row.add_state[0...3].lambda
-        run_loop_in_parallel(msms.size(), [&](size_t start, size_t end) {
-            for (size_t i = start; i < end; i++) {
-                const auto& msm = msms[i];
-                size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW);
-                size_t msm_row_index = msm_row_indices[i];
-                // 1st MSM row will have accumulator equal to the previous MSM output
-                // (or point at infinity for 1st MSM)
-                size_t accumulator_index = msm_row_indices[i] - 1;
-                const size_t msm_size = msm.size();
-                const size_t rows_per_round =
-                    (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
-
-                for (size_t j = 0; j < num_rounds; ++j) {
-                    for (size_t k = 0; k < rows_per_round; ++k) {
-                        auto& row = msm_state[msm_row_index];
-                        const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-                        const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
-                        const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
-                        row.accumulator_x = acc_x;
-                        row.accumulator_y = acc_y;
-
-                        for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-                            auto& add_state = row.add_state[m];
-                            bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
-
-                            const auto& inverse = inverse_trace[trace_index];
-                            const auto& p1 = p1_trace[trace_index];
-                            const auto& p2 = p2_trace[trace_index];
-                            add_state.collision_inverse = add_predicate ? inverse : 0;
-                            add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
-                            trace_index++;
-                        }
-                        accumulator_index++;
-                        msm_row_index++;
-                    }
-
-                    if (j < num_rounds - 1) {
-                        MSMState& row = msm_state[msm_row_index];
-                        const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-                        const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
-                        const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
-                        row.accumulator_x = acc_x;
-                        row.accumulator_y = acc_y;
-
-                        for (size_t m = 0; m < 4; ++m) {
-                            auto& add_state = row.add_state[m];
-                            add_state.collision_inverse = 0;
-                            const FF& dx = p1_trace[trace_index].x;
-                            const FF& inverse = inverse_trace[trace_index];
-                            add_state.lambda = ((dx + dx + dx) * dx) * inverse;
-                            trace_index++;
-                        }
-                        accumulator_index++;
-                        msm_row_index++;
-                    } else {
+        run_loop_in_parallel(
+            msms.size(),
+            [&](size_t start, size_t end) {
+                for (size_t i = start; i < end; i++) {
+                    const auto& msm = msms[i];
+                    size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW);
+                    size_t msm_row_index = msm_row_indices[i];
+                    // 1st MSM row will have accumulator equal to the previous MSM output
+                    // (or point at infinity for 1st MSM)
+                    size_t accumulator_index = msm_row_indices[i] - 1;
+                    const size_t msm_size = msm.size();
+                    const size_t rows_per_round =
+                        (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+
+                    for (size_t j = 0; j < num_rounds; ++j) {
                         for (size_t k = 0; k < rows_per_round; ++k) {
-                            MSMState& row = msm_state[msm_row_index];
+                            auto& row = msm_state[msm_row_index];
                             const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-
-                            const size_t idx = k * ADDITIONS_PER_ROW;
-
                             const FF& acc_x =
                                 normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
                             const FF& acc_y =
@@ -439,7 +404,7 @@ class ECCVMMSMMBuilder {
 
                             for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
                                 auto& add_state = row.add_state[m];
-                                bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+                                bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
 
                                 const auto& inverse = inverse_trace[trace_index];
                                 const auto& p1 = p1_trace[trace_index];
@@ -451,10 +416,60 @@ class ECCVMMSMMBuilder {
                             accumulator_index++;
                             msm_row_index++;
                         }
+
+                        if (j < num_rounds - 1) {
+                            MSMState& row = msm_state[msm_row_index];
+                            const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+                            const FF& acc_x =
+                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                            const FF& acc_y =
+                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                            row.accumulator_x = acc_x;
+                            row.accumulator_y = acc_y;
+
+                            for (size_t m = 0; m < 4; ++m) {
+                                auto& add_state = row.add_state[m];
+                                add_state.collision_inverse = 0;
+                                const FF& dx = p1_trace[trace_index].x;
+                                const FF& inverse = inverse_trace[trace_index];
+                                add_state.lambda = ((dx + dx + dx) * dx) * inverse;
+                                trace_index++;
+                            }
+                            accumulator_index++;
+                            msm_row_index++;
+                        } else {
+                            for (size_t k = 0; k < rows_per_round; ++k) {
+                                MSMState& row = msm_state[msm_row_index];
+                                const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+
+                                const size_t idx = k * ADDITIONS_PER_ROW;
+
+                                const FF& acc_x =
+                                    normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                                const FF& acc_y =
+                                    normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                                row.accumulator_x = acc_x;
+                                row.accumulator_y = acc_y;
+
+                                for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+                                    auto& add_state = row.add_state[m];
+                                    bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+
+                                    const auto& inverse = inverse_trace[trace_index];
+                                    const auto& p1 = p1_trace[trace_index];
+                                    const auto& p2 = p2_trace[trace_index];
+                                    add_state.collision_inverse = add_predicate ? inverse : 0;
+                                    add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
+                                    trace_index++;
+                                }
+                                accumulator_index++;
+                                msm_row_index++;
+                            }
+                        }
                     }
                 }
-            }
-        });
+            },
+            HACK_TO_FIX_VANILLA);
 
         // populate the final row in the MSM execution trace.
         // we always require 1 extra row at the end of the trace, because the accumulator x/y coordinates for row `i`
diff --git a/barretenberg/cpp/src/barretenberg/goblin/goblin_recursion.test.cpp b/barretenberg/cpp/src/barretenberg/goblin/goblin_recursion.test.cpp
index 43280a800ed..814dfecd9c3 100644
--- a/barretenberg/cpp/src/barretenberg/goblin/goblin_recursion.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/goblin/goblin_recursion.test.cpp
@@ -33,12 +33,10 @@ class GoblinRecursionTests : public ::testing::Test {
 };
 
 /**
- * @brief A full Goblin test that mimicks the basic aztec client architecture
- * @details
+ * @brief Test illustrating a Goblin-based IVC scheme
+ * @details Goblin is usd to accumulate recursive verifications of the GoblinUltraHonk proving system.
  */
-// TODO fix with https://github.com/AztecProtocol/barretenberg/issues/930
-// intermittent failures, presumably due to uninitialized memory
-TEST_F(GoblinRecursionTests, DISABLED_Vanilla)
+TEST_F(GoblinRecursionTests, Vanilla)
 {
     Goblin goblin;
 

From b4e40dab487e274565c134ad86aaa4c60a72d89f Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 6 May 2024 14:19:08 +0000
Subject: [PATCH 2/6] Bring benchmark comparison script in line with others.

---
 .../compare_branch_vs_baseline.sh             | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)
 rename barretenberg/cpp/{src/barretenberg/benchmark => scripts}/compare_branch_vs_baseline.sh (74%)

diff --git a/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
similarity index 74%
rename from barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh
rename to barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
index 34ee2ce171d..acd11d0139a 100755
--- a/barretenberg/cpp/src/barretenberg/benchmark/compare_branch_vs_baseline.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
@@ -5,10 +5,12 @@
 # it is up to date with local master, and run the script.
 
 # Specify the benchmark suite and the "baseline" branch against which to compare
-BENCH_TARGET=${1:?"Please provide the name of a benchmark target."}
+BENCHMARK=${1:?"Please provide the name of a benchmark target."}
+COMMAND=${2:-./$BENCHMARK}
 BASELINE_BRANCH="master"
 
-echo -e "\nComparing $BENCH_TARGET between $BASELINE_BRANCH and current branch:"
+
+echo -e "\nComparing $BENCHMARK between $BASELINE_BRANCH and current branch:"
 # Set some directories
 BASE_DIR="$HOME/aztec-packages/barretenberg/cpp"
 BUILD_DIR="$BASE_DIR/build-bench" # matches build dir specified in bench preset
@@ -24,24 +26,24 @@ cd $BASE_DIR
 mkdir $BENCH_RESULTS_DIR
 
 # Build and run bench in current branch
-echo -e "\nConfiguring and building $BENCH_TARGET in current feature branch..\n"
-rm -rf $BUILD_DIR
-cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCH_TARGET
+echo -e "\nConfiguring and building $BENCHMARK in current feature branch..\n"
+# rm -rf $BUILD_DIR
+cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCHMARK
 cd build-bench
 BRANCH_RESULTS="$BENCH_RESULTS_DIR/results_branch.json"
-echo -e "\nRunning $BENCH_TARGET in feature branch.."
-bin/$BENCH_TARGET --benchmark_format=json > $BRANCH_RESULTS
+echo -e "\nRunning $COMMAND in feature branch.."
+bin/$COMMAND --benchmark_format=json > $BRANCH_RESULTS
 
 # Checkout baseline branch, run benchmarks, save results in json format
-echo -e "\nConfiguring and building $BENCH_TARGET in $BASELINE_BRANCH branch..\n"
+echo -e "\nConfiguring and building $BENCHMARK in $BASELINE_BRANCH branch..\n"
 git checkout master > /dev/null
 cd $BASE_DIR
 rm -rf $BUILD_DIR
-cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCH_TARGET
+cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCHMARK
 cd build-bench
 BASELINE_RESULTS="$BENCH_RESULTS_DIR/results_baseline.json"
-echo -e "\nRunning $BENCH_TARGET in master.."
-bin/$BENCH_TARGET --benchmark_format=json > $BASELINE_RESULTS
+echo -e "\nRunning $COMMAND in master.."
+bin/$COMMAND --benchmark_format=json > $BASELINE_RESULTS
 
 # Call compare.py on the results (json) to get high level statistics.
 # See docs at https://github.com/google/benchmark/blob/main/docs/tools.md for more details.

From 0f9f29112d386945c1c2f370fc6198cac11e2dcc Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 6 May 2024 16:36:31 +0000
Subject: [PATCH 3/6] Add update compare script to use remote instance and
 align other

---
 .../cpp/scripts/compare_branch_vs_baseline.sh | 85 ++++++++++---------
 .../compare_branch_vs_baseline_remote.sh      | 61 +++++++++++++
 2 files changed, 106 insertions(+), 40 deletions(-)
 create mode 100755 barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh

diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
index acd11d0139a..f798893781e 100755
--- a/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline.sh
@@ -1,56 +1,61 @@
 #!/usr/bin/env bash
 
+# Install requirements (numpy + scipy) for comparison script if necessary.
+# Note: By default, installation will occur in $HOME/.local/bin.
+# pip3 install --user -r $BUILD_DIR/_deps/benchmark-src/requirements.txt
+
+
 # This script is used to compare a suite of benchmarks between baseline (default: master) and
 # the branch from which the script is run. Simply check out the branch of interest, ensure
 # it is up to date with local master, and run the script.
 
 # Specify the benchmark suite and the "baseline" branch against which to compare
-BENCHMARK=${1:?"Please provide the name of a benchmark target."}
-COMMAND=${2:-./$BENCHMARK}
-BASELINE_BRANCH="master"
+BENCHMARK=${1:-goblin_bench}
+FILTER=${2:-""}
+PRESET=${3:-clang16}
+BUILD_DIR=${4:-build}
+HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
 
+BASELINE_BRANCH="master"
+BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
 
 echo -e "\nComparing $BENCHMARK between $BASELINE_BRANCH and current branch:"
-# Set some directories
-BASE_DIR="$HOME/aztec-packages/barretenberg/cpp"
-BUILD_DIR="$BASE_DIR/build-bench" # matches build dir specified in bench preset
-BENCH_RESULTS_DIR="$BASE_DIR/tmp_bench_results"
-BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
 
-# Install requirements (numpy + scipy) for comparison script if necessary.
-# Note: By default, installation will occur in $HOME/.local/bin.
-pip3 install --user -r $BUILD_DIR/_deps/benchmark-src/requirements.txt
-
-# Create temporary directory for benchmark results (json)
-cd $BASE_DIR
-mkdir $BENCH_RESULTS_DIR
-
-# Build and run bench in current branch
-echo -e "\nConfiguring and building $BENCHMARK in current feature branch..\n"
-# rm -rf $BUILD_DIR
-cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCHMARK
-cd build-bench
-BRANCH_RESULTS="$BENCH_RESULTS_DIR/results_branch.json"
-echo -e "\nRunning $COMMAND in feature branch.."
-bin/$COMMAND --benchmark_format=json > $BRANCH_RESULTS
-
-# Checkout baseline branch, run benchmarks, save results in json format
-echo -e "\nConfiguring and building $BENCHMARK in $BASELINE_BRANCH branch..\n"
-git checkout master > /dev/null
-cd $BASE_DIR
-rm -rf $BUILD_DIR
-cmake --preset bench > /dev/null && cmake --build --preset bench --target $BENCHMARK
-cd build-bench
-BASELINE_RESULTS="$BENCH_RESULTS_DIR/results_baseline.json"
-echo -e "\nRunning $COMMAND in master.."
-bin/$COMMAND --benchmark_format=json > $BASELINE_RESULTS
+# Move above script dir.
+cd $(dirname $0)/..
+
+# Configure and build benchmark in feature branch
+echo -e "\nConfiguring and building $BENCHMARK in current feature branch...\n"
+cmake --preset $PRESET
+cmake --build --preset $PRESET --target $BENCHMARK
+
+# Run bench in current branch
+echo -e "\nRunning benchmark in feature branch.."
+./scripts/benchmark.sh $BENCHMARK\
+                        "./$BENCHMARK --benchmark_filter=$FILTER\
+                                      --benchmark_out=results_after.json\
+                                      --benchmark_out_format=json"\
+                        $PRESET
+                        $BUILD_DIR
+
+# Configure and build benchmark in $BASELINE branch
+echo -e "\nConfiguring and building $BENCHMARK in $BASELINE_BRANCH...\n"
+git checkout $BASELINE_BRANCH
+cmake --preset $PRESET
+cmake --build --preset $PRESET --target $BENCHMARK
+
+# Run bench in current branch
+echo -e "\nRunning benchmark in feature branch.."
+./scripts/benchmark.sh $BENCHMARK\
+                        "./$BENCHMARK --benchmark_filter=$FILTER\
+                                      --benchmark_out=results_before.json\
+                                      --benchmark_out_format=json"\
+                        $PRESET
+                        $BUILD_DIR
 
 # Call compare.py on the results (json) to get high level statistics.
 # See docs at https://github.com/google/benchmark/blob/main/docs/tools.md for more details.
-$BENCH_TOOLS_DIR/compare.py benchmarks $BASELINE_RESULTS $BRANCH_RESULTS
+$BENCH_TOOLS_DIR/compare.py benchmarks $BUILD_DIR/results_before.json $BUILD_DIR/results_after.json
 
 # Return to branch from which the script was called
-git checkout -
-
-# Delete the temporary results directory and its contents
-rm -r $BENCH_RESULTS_DIR
+git checkout -
\ No newline at end of file
diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
new file mode 100755
index 00000000000..4211dfb4a76
--- /dev/null
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Install requirements (numpy + scipy) for comparison script if necessary.
+# Note: By default, installation will occur in $HOME/.local/bin.
+# pip3 install --user -r $BUILD_DIR/_deps/benchmark-src/requirements.txt
+
+
+# This script is used to compare a suite of benchmarks between baseline (default: master) and
+# the branch from which the script is run. Simply check out the branch of interest, ensure
+# it is up to date with local master, and run the script.
+
+# Specify the benchmark suite and the "baseline" branch against which to compare
+BENCHMARK=${1:-goblin_bench}
+FILTER=${2:-""}
+PRESET=${3:-clang16}
+BUILD_DIR=${4:-build}
+HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
+
+BASELINE_BRANCH="master"
+BENCH_TOOLS_DIR="$BUILD_DIR/_deps/benchmark-src/tools"
+
+echo -e "\nComparing $BENCHMARK between $BASELINE_BRANCH and current branch:"
+
+# Move above script dir.
+cd $(dirname $0)/..
+
+# Configure and build benchmark in feature branch
+echo -e "\nConfiguring and building $BENCHMARK in current feature branch...\n"
+cmake --preset $PRESET
+cmake --build --preset $PRESET --target $BENCHMARK
+
+# Run bench in current branch
+echo -e "\nRunning benchmark in feature branch.."
+./scripts/benchmark_remote.sh $BENCHMARK\
+                              "./$BENCHMARK --benchmark_filter=$FILTER\
+                                            --benchmark_out=results_after.json\
+                                            --benchmark_out_format=json"\
+                              $PRESET
+                              $BUILD_DIR
+
+# Configure and build benchmark in $BASELINE branch
+echo -e "\nConfiguring and building $BENCHMARK in $BASELINE_BRANCH...\n"
+git checkout $BASELINE_BRANCH
+cmake --preset $PRESET
+cmake --build --preset $PRESET --target $BENCHMARK
+
+zs# Run bench in current branch
+echo -e "\nRunning benchmark in feature branch.."
+./scripts/benchmark_remote.sh $BENCHMARK\
+                              "./$BENCHMARK --benchmark_filter=$FILTER\
+                                            --benchmark_out=results_before.json\
+                                            --benchmark_out_format=json"\
+                              $PRESET
+                              $BUILD_DIR
+
+# Call compare.py on the results (json) to get high level statistics.
+# See docs at https://github.com/google/benchmark/blob/main/docs/tools.md for more details.
+$BENCH_TOOLS_DIR/compare.py benchmarks $BUILD_DIR/results_before.json $BUILD_DIR/results_after.json
+
+# Return to branch from which the script was called
+git checkout -
\ No newline at end of file

From 75ec2fad6bb7abbb8fc66262fd59a9a092aa2dca Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 6 May 2024 17:30:06 +0000
Subject: [PATCH 4/6] Reinstate  scp lines

---
 barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
index 4211dfb4a76..d28cc0a4972 100755
--- a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
@@ -37,6 +37,7 @@ echo -e "\nRunning benchmark in feature branch.."
                                             --benchmark_out_format=json"\
                               $PRESET
                               $BUILD_DIR
+scp $BB_SSH_KEY $BB_SSH_INSTANCE:$BB_SSH_CPP_PATH/build/results_after.json $BUILD_DIR/
 
 # Configure and build benchmark in $BASELINE branch
 echo -e "\nConfiguring and building $BENCHMARK in $BASELINE_BRANCH...\n"
@@ -44,7 +45,7 @@ git checkout $BASELINE_BRANCH
 cmake --preset $PRESET
 cmake --build --preset $PRESET --target $BENCHMARK
 
-zs# Run bench in current branch
+# Run bench in current branch
 echo -e "\nRunning benchmark in feature branch.."
 ./scripts/benchmark_remote.sh $BENCHMARK\
                               "./$BENCHMARK --benchmark_filter=$FILTER\
@@ -52,6 +53,7 @@ echo -e "\nRunning benchmark in feature branch.."
                                             --benchmark_out_format=json"\
                               $PRESET
                               $BUILD_DIR
+scp $BB_SSH_KEY $BB_SSH_INSTANCE:$BB_SSH_CPP_PATH/build/results_before.json $BUILD_DIR/
 
 # Call compare.py on the results (json) to get high level statistics.
 # See docs at https://github.com/google/benchmark/blob/main/docs/tools.md for more details.

From dd4418879afe24e63bf2d02ec19f896ce0f746b3 Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 6 May 2024 17:31:08 +0000
Subject: [PATCH 5/6] Just don't multithread the MSM builder

---
 .../src/barretenberg/eccvm/msm_builder.hpp    | 439 +++++++++---------
 1 file changed, 208 insertions(+), 231 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
index 1eff50b35e9..5572bab54ee 100644
--- a/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
+++ b/barretenberg/cpp/src/barretenberg/eccvm/msm_builder.hpp
@@ -207,154 +207,143 @@ class ECCVMMSMMBuilder {
         // we start the accumulator at the point at infinity
         accumulator_trace[0] = (CycleGroup::affine_point_at_infinity);
 
-        // TODO(https://github.com/AztecProtocol/barretenberg/issues/973): Reinstate multitreading.
-        // For now, only multithread some parts of this builder for circuits that are larger than ones we see or test.
-        static constexpr uint32_t HACK_TO_FIX_VANILLA = uint32_t(1) << 31;
+        // TODO(https://github.com/AztecProtocol/barretenberg/issues/973): Reinstate multitreading?
         // populate point trace data, and the components of the MSM execution trace that do not relate to affine point
         // operations
-        run_loop_in_parallel(
-            msms.size(),
-            [&](size_t start, size_t end) {
-                for (size_t i = start; i < end; i++) {
-                    Element accumulator = CycleGroup::affine_point_at_infinity;
-                    const auto& msm = msms[i];
-                    size_t msm_row_index = msm_row_indices[i];
-                    const size_t msm_size = msm.size();
-                    const size_t rows_per_round =
-                        (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
-                    size_t trace_index = (msm_row_indices[i] - 1) * 4;
-
-                    for (size_t j = 0; j < num_rounds; ++j) {
-                        const uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
-
-                        for (size_t k = 0; k < rows_per_round; ++k) {
-                            const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size
-                                                              ? msm_size % ADDITIONS_PER_ROW
-                                                              : ADDITIONS_PER_ROW;
-                            auto& row = msm_state[msm_row_index];
-                            const size_t idx = k * ADDITIONS_PER_ROW;
-                            row.msm_transition = (j == 0) && (k == 0);
-                            for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-
-                                auto& add_state = row.add_state[m];
-                                add_state.add = points_per_row > m;
-                                int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
-                                // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
-                                // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in
-                                // the MSM columns into the MSM accumulator `add_state.slice` = A 4-bit WNAF slice of
-                                // the scalar multiplier associated with the point we are adding (the specific slice
-                                // chosen depends on the value of msm_round) (WNAF = windowed-non-adjacent-form. Value
-                                // range is `-15, -13,
-                                // ..., 15`) If `add_state.add = 1`, we want `add_state.slice` to be the *compressed*
-                                // form of the WNAF slice value. (compressed = no gaps in the value range. i.e. -15,
-                                // -13, ..., 15 maps to 0, ... , 15)
-                                add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
-                                add_state.point =
-                                    add_state.add ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
-                                                  : AffineElement{ 0, 0 };
+        for (size_t i = 0; i < msms.size(); i++) {
+            Element accumulator = CycleGroup::affine_point_at_infinity;
+            const auto& msm = msms[i];
+            size_t msm_row_index = msm_row_indices[i];
+            const size_t msm_size = msm.size();
+            const size_t rows_per_round = (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
+            size_t trace_index = (msm_row_indices[i] - 1) * 4;
 
-                                // predicate logic:
-                                // add_predicate should normally equal add_state.add
-                                // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point
-                                // addition of a new MSM In this case, we do NOT add the 1st point into the accumulator,
-                                // instead we SET the accumulator to equal the 1st point. add_predicate is used to
-                                // determine whether we add the output of a point addition into the accumulator,
-                                // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add =
-                                // true
-                                bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
-
-                                Element p1 = (m == 0) ? Element(add_state.point) : accumulator;
-                                Element p2 = (m == 0) ? accumulator : Element(add_state.point);
-
-                                accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1);
-                                p1_trace[trace_index] = p1;
-                                p2_trace[trace_index] = p2;
-                                p3_trace[trace_index] = accumulator;
-                                operation_trace[trace_index] = false;
-                                trace_index++;
-                            }
-                            accumulator_trace[msm_row_index] = accumulator;
-                            row.q_add = true;
-                            row.q_double = false;
-                            row.q_skew = false;
-                            row.msm_round = static_cast<uint32_t>(j);
-                            row.msm_size = static_cast<uint32_t>(msm_size);
-                            row.msm_count = static_cast<uint32_t>(idx);
-                            row.pc = pc;
-                            msm_row_index++;
-                        }
-                        // doubling
-                        if (j < num_rounds - 1) {
-                            auto& row = msm_state[msm_row_index];
-                            row.msm_transition = false;
-                            row.msm_round = static_cast<uint32_t>(j + 1);
-                            row.msm_size = static_cast<uint32_t>(msm_size);
-                            row.msm_count = static_cast<uint32_t>(0);
-                            row.q_add = false;
-                            row.q_double = true;
-                            row.q_skew = false;
-                            for (size_t m = 0; m < 4; ++m) {
-
-                                auto& add_state = row.add_state[m];
-                                add_state.add = false;
-                                add_state.slice = 0;
-                                add_state.point = { 0, 0 };
-                                add_state.collision_inverse = 0;
-
-                                p1_trace[trace_index] = accumulator;
-                                p2_trace[trace_index] = accumulator;
-                                accumulator = accumulator.dbl();
-                                p3_trace[trace_index] = accumulator;
-                                operation_trace[trace_index] = true;
-                                trace_index++;
-                            }
-                            accumulator_trace[msm_row_index] = accumulator;
-                            msm_row_index++;
-                        } else {
-                            for (size_t k = 0; k < rows_per_round; ++k) {
-                                auto& row = msm_state[msm_row_index];
-
-                                const size_t points_per_row = (k + 1) * ADDITIONS_PER_ROW > msm_size
-                                                                  ? msm_size % ADDITIONS_PER_ROW
-                                                                  : ADDITIONS_PER_ROW;
-                                const size_t idx = k * ADDITIONS_PER_ROW;
-                                row.msm_transition = false;
-
-                                Element acc_expected = accumulator;
-
-                                for (size_t m = 0; m < 4; ++m) {
-                                    auto& add_state = row.add_state[m];
-                                    add_state.add = points_per_row > m;
-                                    add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
-
-                                    add_state.point =
-                                        add_state.add
-                                            ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
-                                            : AffineElement{ 0, 0 };
-                                    bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
-                                    auto p1 = accumulator;
-                                    accumulator = add_predicate ? accumulator + add_state.point : accumulator;
-                                    p1_trace[trace_index] = p1;
-                                    p2_trace[trace_index] = add_state.point;
-                                    p3_trace[trace_index] = accumulator;
-                                    operation_trace[trace_index] = false;
-                                    trace_index++;
-                                }
-                                row.q_add = false;
-                                row.q_double = false;
-                                row.q_skew = true;
-                                row.msm_round = static_cast<uint32_t>(j + 1);
-                                row.msm_size = static_cast<uint32_t>(msm_size);
-                                row.msm_count = static_cast<uint32_t>(idx);
-                                row.pc = pc;
-                                accumulator_trace[msm_row_index] = accumulator;
-                                msm_row_index++;
-                            }
+            for (size_t j = 0; j < num_rounds; ++j) {
+                const uint32_t pc = static_cast<uint32_t>(pc_indices[i]);
+
+                for (size_t k = 0; k < rows_per_round; ++k) {
+                    const size_t points_per_row =
+                        (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
+                    auto& row = msm_state[msm_row_index];
+                    const size_t idx = k * ADDITIONS_PER_ROW;
+                    row.msm_transition = (j == 0) && (k == 0);
+                    for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+
+                        auto& add_state = row.add_state[m];
+                        add_state.add = points_per_row > m;
+                        int slice = add_state.add ? msm[idx + m].wnaf_slices[j] : 0;
+                        // In the MSM columns in the ECCVM circuit, we can add up to 4 points per row.
+                        // if `row.add_state[m].add = 1`, this indicates that we want to add the `m`'th point in
+                        // the MSM columns into the MSM accumulator `add_state.slice` = A 4-bit WNAF slice of
+                        // the scalar multiplier associated with the point we are adding (the specific slice
+                        // chosen depends on the value of msm_round) (WNAF = windowed-non-adjacent-form. Value
+                        // range is `-15, -13,
+                        // ..., 15`) If `add_state.add = 1`, we want `add_state.slice` to be the *compressed*
+                        // form of the WNAF slice value. (compressed = no gaps in the value range. i.e. -15,
+                        // -13, ..., 15 maps to 0, ... , 15)
+                        add_state.slice = add_state.add ? (slice + 15) / 2 : 0;
+                        add_state.point = add_state.add
+                                              ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
+                                              : AffineElement{ 0, 0 };
+
+                        // predicate logic:
+                        // add_predicate should normally equal add_state.add
+                        // However! if j == 0 AND k == 0 AND m == 0 this implies we are examing the 1st point
+                        // addition of a new MSM In this case, we do NOT add the 1st point into the accumulator,
+                        // instead we SET the accumulator to equal the 1st point. add_predicate is used to
+                        // determine whether we add the output of a point addition into the accumulator,
+                        // therefore if j == 0 AND k == 0 AND m == 0, add_predicate = 0 even if add_state.add =
+                        // true
+                        bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
+
+                        Element p1 = (m == 0) ? Element(add_state.point) : accumulator;
+                        Element p2 = (m == 0) ? accumulator : Element(add_state.point);
+
+                        accumulator = add_predicate ? (accumulator + add_state.point) : Element(p1);
+                        p1_trace[trace_index] = p1;
+                        p2_trace[trace_index] = p2;
+                        p3_trace[trace_index] = accumulator;
+                        operation_trace[trace_index] = false;
+                        trace_index++;
+                    }
+                    accumulator_trace[msm_row_index] = accumulator;
+                    row.q_add = true;
+                    row.q_double = false;
+                    row.q_skew = false;
+                    row.msm_round = static_cast<uint32_t>(j);
+                    row.msm_size = static_cast<uint32_t>(msm_size);
+                    row.msm_count = static_cast<uint32_t>(idx);
+                    row.pc = pc;
+                    msm_row_index++;
+                }
+                // doubling
+                if (j < num_rounds - 1) {
+                    auto& row = msm_state[msm_row_index];
+                    row.msm_transition = false;
+                    row.msm_round = static_cast<uint32_t>(j + 1);
+                    row.msm_size = static_cast<uint32_t>(msm_size);
+                    row.msm_count = static_cast<uint32_t>(0);
+                    row.q_add = false;
+                    row.q_double = true;
+                    row.q_skew = false;
+                    for (size_t m = 0; m < 4; ++m) {
+
+                        auto& add_state = row.add_state[m];
+                        add_state.add = false;
+                        add_state.slice = 0;
+                        add_state.point = { 0, 0 };
+                        add_state.collision_inverse = 0;
+
+                        p1_trace[trace_index] = accumulator;
+                        p2_trace[trace_index] = accumulator;
+                        accumulator = accumulator.dbl();
+                        p3_trace[trace_index] = accumulator;
+                        operation_trace[trace_index] = true;
+                        trace_index++;
+                    }
+                    accumulator_trace[msm_row_index] = accumulator;
+                    msm_row_index++;
+                } else {
+                    for (size_t k = 0; k < rows_per_round; ++k) {
+                        auto& row = msm_state[msm_row_index];
+
+                        const size_t points_per_row =
+                            (k + 1) * ADDITIONS_PER_ROW > msm_size ? msm_size % ADDITIONS_PER_ROW : ADDITIONS_PER_ROW;
+                        const size_t idx = k * ADDITIONS_PER_ROW;
+                        row.msm_transition = false;
+
+                        Element acc_expected = accumulator;
+
+                        for (size_t m = 0; m < 4; ++m) {
+                            auto& add_state = row.add_state[m];
+                            add_state.add = points_per_row > m;
+                            add_state.slice = add_state.add ? msm[idx + m].wnaf_skew ? 7 : 0 : 0;
+
+                            add_state.point = add_state.add
+                                                  ? msm[idx + m].precomputed_table[static_cast<size_t>(add_state.slice)]
+                                                  : AffineElement{ 0, 0 };
+                            bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+                            auto p1 = accumulator;
+                            accumulator = add_predicate ? accumulator + add_state.point : accumulator;
+                            p1_trace[trace_index] = p1;
+                            p2_trace[trace_index] = add_state.point;
+                            p3_trace[trace_index] = accumulator;
+                            operation_trace[trace_index] = false;
+                            trace_index++;
                         }
+                        row.q_add = false;
+                        row.q_double = false;
+                        row.q_skew = true;
+                        row.msm_round = static_cast<uint32_t>(j + 1);
+                        row.msm_size = static_cast<uint32_t>(msm_size);
+                        row.msm_count = static_cast<uint32_t>(idx);
+                        row.pc = pc;
+                        accumulator_trace[msm_row_index] = accumulator;
+                        msm_row_index++;
                     }
                 }
-            },
-            HACK_TO_FIX_VANILLA);
+            }
+        }
 
         // Normalize the points in the point trace
         run_loop_in_parallel(point_trace.size(), [&](size_t start, size_t end) {
@@ -377,99 +366,87 @@ class ECCVMMSMMBuilder {
         // complete the computation of the ECCVM execution trace, by adding the affine intermediate point data
         // i.e. row.accumulator_x, row.accumulator_y, row.add_state[0...3].collision_inverse,
         // row.add_state[0...3].lambda
-        run_loop_in_parallel(
-            msms.size(),
-            [&](size_t start, size_t end) {
-                for (size_t i = start; i < end; i++) {
-                    const auto& msm = msms[i];
-                    size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW);
-                    size_t msm_row_index = msm_row_indices[i];
-                    // 1st MSM row will have accumulator equal to the previous MSM output
-                    // (or point at infinity for 1st MSM)
-                    size_t accumulator_index = msm_row_indices[i] - 1;
-                    const size_t msm_size = msm.size();
-                    const size_t rows_per_round =
-                        (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
-
-                    for (size_t j = 0; j < num_rounds; ++j) {
-                        for (size_t k = 0; k < rows_per_round; ++k) {
-                            auto& row = msm_state[msm_row_index];
-                            const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-                            const FF& acc_x =
-                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
-                            const FF& acc_y =
-                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
-                            row.accumulator_x = acc_x;
-                            row.accumulator_y = acc_y;
-
-                            for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-                                auto& add_state = row.add_state[m];
-                                bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
-
-                                const auto& inverse = inverse_trace[trace_index];
-                                const auto& p1 = p1_trace[trace_index];
-                                const auto& p2 = p2_trace[trace_index];
-                                add_state.collision_inverse = add_predicate ? inverse : 0;
-                                add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
-                                trace_index++;
-                            }
-                            accumulator_index++;
-                            msm_row_index++;
-                        }
+        for (size_t i = 0; i < msms.size(); i++) {
+            const auto& msm = msms[i];
+            size_t trace_index = ((msm_row_indices[i] - 1) * ADDITIONS_PER_ROW);
+            size_t msm_row_index = msm_row_indices[i];
+            // 1st MSM row will have accumulator equal to the previous MSM output
+            // (or point at infinity for 1st MSM)
+            size_t accumulator_index = msm_row_indices[i] - 1;
+            const size_t msm_size = msm.size();
+            const size_t rows_per_round = (msm_size / ADDITIONS_PER_ROW) + (msm_size % ADDITIONS_PER_ROW != 0 ? 1 : 0);
 
-                        if (j < num_rounds - 1) {
-                            MSMState& row = msm_state[msm_row_index];
-                            const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-                            const FF& acc_x =
-                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
-                            const FF& acc_y =
-                                normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
-                            row.accumulator_x = acc_x;
-                            row.accumulator_y = acc_y;
-
-                            for (size_t m = 0; m < 4; ++m) {
-                                auto& add_state = row.add_state[m];
-                                add_state.collision_inverse = 0;
-                                const FF& dx = p1_trace[trace_index].x;
-                                const FF& inverse = inverse_trace[trace_index];
-                                add_state.lambda = ((dx + dx + dx) * dx) * inverse;
-                                trace_index++;
-                            }
-                            accumulator_index++;
-                            msm_row_index++;
-                        } else {
-                            for (size_t k = 0; k < rows_per_round; ++k) {
-                                MSMState& row = msm_state[msm_row_index];
-                                const Element& normalized_accumulator = accumulator_trace[accumulator_index];
-
-                                const size_t idx = k * ADDITIONS_PER_ROW;
-
-                                const FF& acc_x =
-                                    normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
-                                const FF& acc_y =
-                                    normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
-                                row.accumulator_x = acc_x;
-                                row.accumulator_y = acc_y;
-
-                                for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
-                                    auto& add_state = row.add_state[m];
-                                    bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
-
-                                    const auto& inverse = inverse_trace[trace_index];
-                                    const auto& p1 = p1_trace[trace_index];
-                                    const auto& p2 = p2_trace[trace_index];
-                                    add_state.collision_inverse = add_predicate ? inverse : 0;
-                                    add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
-                                    trace_index++;
-                                }
-                                accumulator_index++;
-                                msm_row_index++;
-                            }
+            for (size_t j = 0; j < num_rounds; ++j) {
+                for (size_t k = 0; k < rows_per_round; ++k) {
+                    auto& row = msm_state[msm_row_index];
+                    const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+                    const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                    const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                    row.accumulator_x = acc_x;
+                    row.accumulator_y = acc_y;
+
+                    for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+                        auto& add_state = row.add_state[m];
+                        bool add_predicate = (m == 0 ? (j != 0 || k != 0) : add_state.add);
+
+                        const auto& inverse = inverse_trace[trace_index];
+                        const auto& p1 = p1_trace[trace_index];
+                        const auto& p2 = p2_trace[trace_index];
+                        add_state.collision_inverse = add_predicate ? inverse : 0;
+                        add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
+                        trace_index++;
+                    }
+                    accumulator_index++;
+                    msm_row_index++;
+                }
+
+                if (j < num_rounds - 1) {
+                    MSMState& row = msm_state[msm_row_index];
+                    const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+                    const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                    const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                    row.accumulator_x = acc_x;
+                    row.accumulator_y = acc_y;
+
+                    for (size_t m = 0; m < 4; ++m) {
+                        auto& add_state = row.add_state[m];
+                        add_state.collision_inverse = 0;
+                        const FF& dx = p1_trace[trace_index].x;
+                        const FF& inverse = inverse_trace[trace_index];
+                        add_state.lambda = ((dx + dx + dx) * dx) * inverse;
+                        trace_index++;
+                    }
+                    accumulator_index++;
+                    msm_row_index++;
+                } else {
+                    for (size_t k = 0; k < rows_per_round; ++k) {
+                        MSMState& row = msm_state[msm_row_index];
+                        const Element& normalized_accumulator = accumulator_trace[accumulator_index];
+
+                        const size_t idx = k * ADDITIONS_PER_ROW;
+
+                        const FF& acc_x = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.x;
+                        const FF& acc_y = normalized_accumulator.is_point_at_infinity() ? 0 : normalized_accumulator.y;
+                        row.accumulator_x = acc_x;
+                        row.accumulator_y = acc_y;
+
+                        for (size_t m = 0; m < ADDITIONS_PER_ROW; ++m) {
+                            auto& add_state = row.add_state[m];
+                            bool add_predicate = add_state.add ? msm[idx + m].wnaf_skew : false;
+
+                            const auto& inverse = inverse_trace[trace_index];
+                            const auto& p1 = p1_trace[trace_index];
+                            const auto& p2 = p2_trace[trace_index];
+                            add_state.collision_inverse = add_predicate ? inverse : 0;
+                            add_state.lambda = add_predicate ? (p2.y - p1.y) * inverse : 0;
+                            trace_index++;
                         }
+                        accumulator_index++;
+                        msm_row_index++;
                     }
                 }
-            },
-            HACK_TO_FIX_VANILLA);
+            }
+        }
 
         // populate the final row in the MSM execution trace.
         // we always require 1 extra row at the end of the trace, because the accumulator x/y coordinates for row `i`

From 78055b0cbd7ec31ce4b2793ea9f8bb8bd11eee11 Mon Sep 17 00:00:00 2001
From: codygunton <codygunton@gmail.com>
Date: Mon, 6 May 2024 17:35:39 +0000
Subject: [PATCH 6/6] Add needed escape

---
 .../cpp/scripts/compare_branch_vs_baseline_remote.sh        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
index d28cc0a4972..27d1af8966a 100755
--- a/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
+++ b/barretenberg/cpp/scripts/compare_branch_vs_baseline_remote.sh
@@ -35,8 +35,9 @@ echo -e "\nRunning benchmark in feature branch.."
                               "./$BENCHMARK --benchmark_filter=$FILTER\
                                             --benchmark_out=results_after.json\
                                             --benchmark_out_format=json"\
-                              $PRESET
+                              $PRESET\
                               $BUILD_DIR
+
 scp $BB_SSH_KEY $BB_SSH_INSTANCE:$BB_SSH_CPP_PATH/build/results_after.json $BUILD_DIR/
 
 # Configure and build benchmark in $BASELINE branch
@@ -51,8 +52,9 @@ echo -e "\nRunning benchmark in feature branch.."
                               "./$BENCHMARK --benchmark_filter=$FILTER\
                                             --benchmark_out=results_before.json\
                                             --benchmark_out_format=json"\
-                              $PRESET
+                              $PRESET\
                               $BUILD_DIR
+
 scp $BB_SSH_KEY $BB_SSH_INSTANCE:$BB_SSH_CPP_PATH/build/results_before.json $BUILD_DIR/
 
 # Call compare.py on the results (json) to get high level statistics.