feat(gpu_prover): implement low VRAM mode (#186)

robik75 · web-flow · commit b5f104c93af4 · 2026-01-20T15:24:49.000+01:00
## What ❔

This PR implements low VRAM mode.

## Why ❔

Low VRAM mode makes it possible to run the prover on GPUs with 24GB of VRAM
instead of 32GB, at the price of a performance penalty.
Depending on the workload and the GPU used, the performance penalty can be
in the ballpark of 2-3x.
The low VRAM mode can be enabled by setting the environment variable
`ZKSYNC_AIRBENDER_LOW_VRAM_MODE` to `1` or `true` (case-insensitive).

## Is this a breaking change?
- [ ] Yes
- [x] No

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted.
diff --git a/gpu_prover/src/execution/gpu_worker.rs b/gpu_prover/src/execution/gpu_worker.rs
@@ -17,12 +17,12 @@ use crate::prover::tracing_data::{InitsAndTeardownsTransfer, TracingDataTransfer
 use crate::witness::trace_unrolled::get_aux_arguments_boundary_values;
 use crossbeam_channel::{Receiver, Sender};
 use era_cudart::device::get_device_properties;
-use log::{debug, error, info, trace};
+use log::{debug, error, info, trace, warn};
 use prover::definitions::AuxArgumentsBoundaryValues;
 use std::ffi::CStr;
-use std::mem;
 use std::ops::Deref;
 use std::process::exit;
+use std::{env, mem};
 use verifier_common::num_queries_for_security_params;
 
 pub fn get_gpu_worker_func(
@@ -52,19 +52,6 @@ enum JobType<'a> {
     Proof(ProofJob<'a>),
 }
 
-fn get_trees_cache_mode(_circuit_type: CircuitType, _context: &ProverContext) -> TreesCacheMode {
-    // match circuit_type {
-    //     CircuitType::Main(main) => match main {
-    //         MainCircuitType::ReducedRiscVLog23Machine if (context.get_mem_size() >> 30) < 28 => {
-    //             TreesCacheMode::CacheNone
-    //         } // less than 28GB
-    //         _ => TreesCacheMode::CacheFull,
-    //     },
-    //     _ => TreesCacheMode::CacheFull,
-    // }
-    TreesCacheMode::CachePatrial
-}
-
 fn gpu_worker(
     device_id: i32,
     prover_context_config: ProverContextConfig,
@@ -73,6 +60,13 @@ fn gpu_worker(
     results: Sender<Option<GpuWorkResult<A>>>,
 ) -> CudaResult<()> {
     trace!("GPU_WORKER[{device_id}] started");
+    // Recompute cosets in low VRAM mode to reduce memory requirement.
+    let recompute_cosets = env::var("ZKSYNC_AIRBENDER_LOW_VRAM_MODE")
+        .map(|s| s == "1" || s.to_lowercase() == "true")
+        .unwrap_or_default();
+    if recompute_cosets {
+        warn!("GPU_WORKER[{device_id}] running in low VRAM mode, this will have negative performance impact");
+    }
     Precomputations::ensure_initialized();
     set_device(device_id)?;
     let props = get_device_properties(device_id)?;
@@ -251,7 +245,6 @@ fn gpu_worker(
                         log_lde_factor as usize,
                     );
                     let pow_bits = verifier_common::POW_BITS as u32;
-                    let trees_cache_mode = get_trees_cache_mode(circuit_type, &context);
                     trace!("BATCH[{batch_id}] GPU_WORKER[{device_id}] producing proof for circuit {circuit_type:?}[{sequence_id}]");
                     let job = prove(
                         circuit_type,
@@ -268,8 +261,8 @@ fn gpu_worker(
                         num_queries,
                         pow_bits,
                         None,
-                        false,
-                        trees_cache_mode,
+                        recompute_cosets,
+                        TreesCacheMode::CachePatrial,
                         &context,
                     )?;
                     JobType::Proof(job)