feat(circuit_prover): Add circuit prover (#2908)

### Summary This PR introduces a new component, `circuit_prover`, which runs multiple WVGs (witness vector generators) and a GPU prover together, without prover groups. The changes are non-destructive; the old setup and the new setup must work in tandem. ### What? Circuit prover is a component that runs X WVGs alongside a GPU prover. Because it makes full use of the CPUs on the GPU machine, WVGs as a separate component can be removed altogether. Prover groups are not needed anymore. Based on empirical testing, we can (almost -- there will be follow-up fixes to make it efficient) fully run everything on a single machine. The current implementation can sunset the old setup. Current metrics show that circuit prover is > 60% more efficient than the old one (but quirks are needed for node proofs to unlock it -- this will be treated as a follow-up). The purpose is to have `circuit_prover` deprecate the old `prover_fri` & `witness_vector_generator`. ### Why? The changes will allow us to reduce our infrastructure footprint by ~2x and fix plenty of issues we had in the past. Namely: - fully decouple from GCP - better resource utilization & reduced costs - reduce overall infrastructure needs (which solves the GPU unavailability we've been facing) - reduce complexity & other inefficiencies (no more prover groups!) - and more ### Ask We want to unblock folks running on AWS. This PR is done as is to speed up the release process on the DevOps side, as it's the longest pole. NOTE: This is the first PR out of a longer set of PRs. Comments are more than welcome, but the following concerns will be addressed in follow-up PRs and are out of scope for this PR: - tracing implementation is subpar; in fact, I'm confident that most metrics could be done via traces - there's a lot of code duplication (both between the old/new prover and in the runner interface between the new WVG & CP) - tests - separation of concerns between job scheduling & job execution - job priority based on resource consumption - other nits (such as no README, constants being hard-coded instead of configuration, etc.)
### Reviewer suggestion This is basically a merge between `prover_fri`, `witness_vector_generator` and `JobProcessor`. Reading this code alongside those components should give you a better view of what's going on. Sorry for making this hard. :/
matter-labs · Sep 20, 2024 · 48317e6 · 48317e6
1 parent 1cf959d
commit 48317e6
Show file tree

Hide file tree

Showing 26 changed files with 1,484 additions and 20 deletions.
diff --git a/.github/workflows/ci-common-reusable.yml b/.github/workflows/ci-common-reusable.yml
@@ -22,6 +22,7 @@ jobs:
           echo "SCCACHE_GCS_SERVICE_ACCOUNT=gha-ci-runners@matterlabs-infra.iam.gserviceaccount.com" >> .env
           echo "SCCACHE_GCS_RW_MODE=READ_WRITE" >> .env
           echo "RUSTC_WRAPPER=sccache" >> .env
+          echo "RUSTFLAGS=--cfg=no_cuda" >> .env
 
       - name: Start services
         run: |

diff --git a/.github/workflows/ci-core-lint-reusable.yml b/.github/workflows/ci-core-lint-reusable.yml
@@ -19,6 +19,7 @@ jobs:
           echo "SCCACHE_GCS_SERVICE_ACCOUNT=gha-ci-runners@matterlabs-infra.iam.gserviceaccount.com" >> .env
           echo "SCCACHE_GCS_RW_MODE=READ_WRITE" >> .env
           echo "RUSTC_WRAPPER=sccache" >> .env
+          echo "RUSTFLAGS=--cfg=no_cuda" >> .env
           echo "prover_url=postgres://postgres:notsecurepassword@localhost:5432/zksync_local_prover" >> $GITHUB_ENV
           echo "core_url=postgres://postgres:notsecurepassword@localhost:5432/zksync_local" >> $GITHUB_ENV
 

diff --git a/.github/workflows/ci-prover-reusable.yml b/.github/workflows/ci-prover-reusable.yml
@@ -57,6 +57,7 @@ jobs:
           echo "SCCACHE_GCS_SERVICE_ACCOUNT=gha-ci-runners@matterlabs-infra.iam.gserviceaccount.com" >> .env
           echo "SCCACHE_GCS_RW_MODE=READ_WRITE" >> .env
           echo "RUSTC_WRAPPER=sccache" >> .env
+          echo "RUSTFLAGS=--cfg=no_cuda" >> .env
 
       - name: Start services
         run: |

diff --git a/core/lib/basic_types/src/prover_dal.rs b/core/lib/basic_types/src/prover_dal.rs
@@ -9,7 +9,7 @@ use crate::{
     basic_fri_types::AggregationRound, protocol_version::ProtocolVersionId, L1BatchNumber,
 };
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct FriProverJobMetadata {
     pub id: u32,
     pub block_number: L1BatchNumber,

diff --git a/docker/witness-generator/Dockerfile b/docker/witness-generator/Dockerfile
@@ -1,7 +1,7 @@
 FROM ghcr.io/matter-labs/zksync-build-base:latest AS builder
 
 ARG DEBIAN_FRONTEND=noninteractive
-ARG RUST_FLAGS=""
+ARG RUST_FLAGS="--cfg=no_cuda"
 ENV RUSTFLAGS=${RUST_FLAGS}
 
 # set of args for use of sccache

diff --git a/docker/witness-vector-generator/Dockerfile b/docker/witness-vector-generator/Dockerfile
@@ -1,6 +1,8 @@
 FROM ghcr.io/matter-labs/zksync-build-base:latest AS builder
 
 ARG DEBIAN_FRONTEND=noninteractive
+ARG RUST_FLAGS="--cfg=no_cuda"
+ENV RUSTFLAGS=${RUST_FLAGS}
 
 # set of args for use of sccache
 ARG SCCACHE_GCS_BUCKET=""

diff --git a/docs/guides/setup-dev.md b/docs/guides/setup-dev.md
@@ -48,6 +48,10 @@ cargo install sqlx-cli --version 0.8.1
 # Foundry
 curl -L https://foundry.paradigm.xyz | bash
 foundryup --branch master
+
+# Non-GPU setup; can be skipped if the machine has a GPU configured for provers
+echo "export RUSTFLAGS='--cfg=no_cuda'" >> ~/.bashrc
+
 # You will need to reload your `*rc` file here
 
 # Clone the repo to the desired location
@@ -237,6 +241,28 @@ Go to the zksync folder and run `nix develop`. After it finishes, you are in a s
 [Foundry](https://book.getfoundry.sh/getting-started/installation) can be utilized for deploying smart contracts. For
 commands related to deployment, you can pass flags for Foundry integration.
 
+## Non-GPU setup
+
+Circuit Prover requires a GPU (& CUDA bindings) to run. If you still want to be able to build everything locally on
+a non-GPU setup, you'll need to change your rustflags.
+
+For a single run, it's enough to export it on the shell:
+
+```
+export RUSTFLAGS='--cfg=no_cuda'
+```
+
+For persistent runs, you can either echo it in your ~/.<shell>rc file (discouraged), or configure it to your taste in
+`config.toml`.
+
+For project-level configuration, edit `/path/to/zksync/.cargo/config.toml`. For a global cargo setup, edit
+`~/.cargo/config.toml`. Add the following:
+
+```toml
+[build]
+rustflags = ["--cfg=no_cuda"]
+```
+
 ## Environment
 
 Edit the lines below and add them to your shell profile file (e.g. `~/.bash_profile`, `~/.zshrc`):

diff --git a/prover/Cargo.lock b/prover/Cargo.lock
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
@@ -51,6 +51,7 @@ structopt = "0.3.26"
 strum = { version = "0.26" }
 tempfile = "3"
 tokio = "1"
+tokio-util = "0.7.11"
 toml_edit = "0.14.4"
 tracing = "0.1"
 tracing-subscriber = "0.3"

diff --git a/prover/crates/bin/circuit_prover/Cargo.toml b/prover/crates/bin/circuit_prover/Cargo.toml
@@ -0,0 +1,38 @@
+[package]
+name = "zksync_circuit_prover"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+repository.workspace = true
+license.workspace = true
+keywords.workspace = true
+categories.workspace = true
+
+[dependencies]
+tokio = { workspace = true, features = ["macros", "time"] }
+tokio-util.workspace = true
+anyhow.workspace = true
+async-trait.workspace = true
+tracing.workspace = true
+bincode.workspace = true
+clap = { workspace = true, features = ["derive"] }
+
+zksync_config.workspace = true
+zksync_object_store.workspace = true
+zksync_prover_dal.workspace = true
+zksync_prover_fri_types.workspace = true
+zksync_prover_fri_utils.workspace = true
+zksync_queued_job_processor.workspace = true
+zksync_types.workspace = true
+zksync_prover_keystore = { workspace = true, features = ["gpu"] }
+zksync_env_config.workspace = true
+zksync_core_leftovers.workspace = true
+zksync_utils.workspace = true
+
+vise.workspace = true
+shivini = { workspace = true, features = [
+    "circuit_definitions",
+    "zksync",
+] }
+zkevm_test_harness.workspace = true
diff --git a/prover/crates/bin/circuit_prover/src/backoff.rs b/prover/crates/bin/circuit_prover/src/backoff.rs
@@ -0,0 +1,39 @@
+use std::{ops::Mul, time::Duration};
+
+/// Backoff - convenience structure that tracks the base, current and maximum delay of a backoff schedule.
+#[derive(Debug, Clone)]
+pub struct Backoff {
+    base_delay: Duration, // first delay handed out; also the value `reset` restores
+    current_delay: Duration, // delay the next call to `delay` will return
+    max_delay: Duration, // cap applied to `current_delay` each time it is multiplied
+}
+
+impl Backoff {
+    /// The delay multiplication coefficient, applied to the stored delay after every call to `delay`.
+    // Currently it's hardcoded, but could be provided in the constructor.
+    const DELAY_MULTIPLIER: u32 = 2;
+
+    /// Create a backoff with base_delay (first delay) and max_delay (cap for subsequent delays); base_delay is stored unclamped.
+    pub fn new(base_delay: Duration, max_delay: Duration) -> Self {
+        Backoff {
+            base_delay,
+            current_delay: base_delay, // the first `delay` call returns base_delay as-is
+            max_delay,
+        }
+    }
+
+    /// Return the current delay, then advance the stored delay by `DELAY_MULTIPLIER`, capped at max_delay.
+    pub fn delay(&mut self) -> Duration {
+        let delay = self.current_delay; // value handed back to the caller
+        self.current_delay = self
+            .current_delay
+            .mul(Self::DELAY_MULTIPLIER) // NOTE(review): `Duration * u32` panics on overflow; only reachable with delays above Duration::MAX / 2
+            .min(self.max_delay);
+        delay
+    }
+
+    /// Reset the backoff time to the base delay, so the next `delay` call returns base_delay again.
+    pub fn reset(&mut self) {
+        self.current_delay = self.base_delay;
+    }
+}