Skip to content

Commit 107bce8

Browse files
author
mcarilli
committed
clean up geometry calculation + format
1 parent 8305019 commit 107bce8

File tree

5 files changed

+111
-91
lines changed

5 files changed

+111
-91
lines changed

gpu_prover/native/ntt/natural_evals_to_bitrev_Z_radix_8.cu

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ namespace airbender::ntt {
55
EXTERN __launch_bounds__(256, 3) __global__
66
void ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp(vectorized_e2_matrix_getter<ld_modifier::cg> gmem_in,
77
vectorized_e2_matrix_setter<st_modifier::cg> gmem_out, const unsigned start_stage,
8-
unsigned exchg_region_bit_chunks, const unsigned log_n, const unsigned grid_offset) {
8+
unsigned exchg_region_bit_chunks, const unsigned log_exchg_region_size,
9+
const unsigned tile_gmem_stride, const unsigned log_n, const unsigned grid_offset) {
910
constexpr unsigned WARP_SIZE = 32u;
1011
constexpr unsigned LOG_RADIX = 3u;
1112
constexpr unsigned RADIX = 1 << LOG_RADIX;
@@ -25,12 +26,8 @@ EXTERN __launch_bounds__(256, 3) __global__
2526
const unsigned lane_id = threadIdx.x & 31;
2627
const unsigned tile_id = lane_id >> LOG_TILE_SIZE;
2728
const unsigned lane_in_tile = lane_id & TILE_MASK;
28-
const unsigned log_exchg_region_size = log_n - start_stage * LOG_RADIX;
29-
const unsigned log_tile_gmem_stride = log_exchg_region_size - 2 * LOG_RADIX;
30-
const unsigned log_blocks_per_exchg_region = log_tile_gmem_stride - LOG_TILE_SIZE - LOG_WARPS_PER_BLOCK;
31-
const unsigned tile_gmem_stride = 1 << log_tile_gmem_stride;
32-
const unsigned block_exchg_region = effective_block_idx_x >> log_blocks_per_exchg_region;
33-
const unsigned block_in_exchg_region = effective_block_idx_x & ((1 << log_blocks_per_exchg_region) - 1);
29+
const unsigned block_exchg_region = blockIdx.x; // effective_block_idx_x >> log_blocks_per_exchg_region;
30+
const unsigned block_in_exchg_region = blockIdx.y; // effective_block_idx_x & ((1 << log_blocks_per_exchg_region) - 1);
3431
const unsigned gmem_block_offset = block_exchg_region << log_exchg_region_size;
3532
const unsigned gmem_warp_offset = ((block_in_exchg_region << LOG_WARPS_PER_BLOCK) + warp_id) << LOG_TILE_SIZE;
3633
gmem_in.add_row(gmem_block_offset + gmem_warp_offset);
@@ -61,11 +58,11 @@ EXTERN __launch_bounds__(256, 3) __global__
6158
smem[addr] = vals0[i];
6259
smem[addr + WARP_SIZE] = vals1[i];
6360
}
64-
// #pragma unroll
65-
// for (unsigned i{0}, addr{thread_offset}; i < RADIX; i++, addr += RADIX * tile_gmem_stride) {
66-
// gmem_out.set_at_row(addr, vals0[i]);
67-
// gmem_out.set_at_row(addr + TILES_PER_WARP * tile_gmem_stride, vals1[i]);
68-
// }
61+
// #pragma unroll
62+
// for (unsigned i{0}, addr{thread_offset}; i < RADIX; i++, addr += RADIX * tile_gmem_stride) {
63+
// gmem_out.set_at_row(addr, vals0[i]);
64+
// gmem_out.set_at_row(addr + TILES_PER_WARP * tile_gmem_stride, vals1[i]);
65+
// }
6966

7067
__syncwarp();
7168
}
@@ -89,7 +86,7 @@ EXTERN __launch_bounds__(256, 3) __global__
8986

9087
const unsigned gmem_write_offset = lane_in_tile + tile_id * 2 * RADIX * tile_gmem_stride;
9188
#pragma unroll
92-
for (unsigned i{0}, addr{gmem_write_offset}; i < RADIX; i++, addr += tile_gmem_stride ) {
89+
for (unsigned i{0}, addr{gmem_write_offset}; i < RADIX; i++, addr += tile_gmem_stride) {
9390
gmem_out.set_at_row(addr, vals0[i]);
9491
gmem_out.set_at_row(addr + RADIX * tile_gmem_stride, vals1[i]);
9592
}
@@ -189,7 +186,7 @@ EXTERN __launch_bounds__(256, 3) __global__
189186
}
190187

191188
warp_exchg_region_offset *= RADIX;
192-
const unsigned exchg_region_0 = warp_exchg_region_offset + tile_id * 2;
189+
const unsigned exchg_region_0 = warp_exchg_region_offset + tile_id * 2;
193190
const unsigned exchg_region_1 = exchg_region_0 + 1;
194191
twiddle_stride >>= LOG_RADIX;
195192
apply_twiddles_distinct_regions<LOG_RADIX>(vals0, vals1, exchg_region_0, exchg_region_1, twiddle_stride, ++exchg_region_bit_chunks);
@@ -218,7 +215,7 @@ EXTERN __launch_bounds__(256, 3) __global__
218215
}
219216

220217
warp_exchg_region_offset *= RADIX;
221-
const unsigned exchg_region_0 = warp_exchg_region_offset + lane_id;
218+
const unsigned exchg_region_0 = warp_exchg_region_offset + lane_id;
222219
const unsigned exchg_region_1 = exchg_region_0 + 32;
223220
twiddle_stride >>= LOG_RADIX;
224221
apply_twiddles_distinct_regions<LOG_RADIX>(vals0, vals1, exchg_region_0, exchg_region_1, twiddle_stride, ++exchg_region_bit_chunks);

gpu_prover/native/ntt/radix_8_utils.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ namespace airbender::ntt {
44

55
EXTERN __launch_bounds__(128, 8) __global__
66
void ab_bit_reverse_by_radix_8(vectorized_e2_matrix_getter<ld_modifier::cg> src, vectorized_e2_matrix_setter<st_modifier::cg> dst,
7-
const unsigned bit_chunks, const unsigned log_n) {
7+
const unsigned bit_chunks, const unsigned log_n) {
88
const unsigned n = 1 << log_n;
99
const unsigned l_index = blockIdx.x * blockDim.x + threadIdx.x;
1010
if (l_index >= n)

gpu_prover/native/ntt/radix_8_utils.cuh

Lines changed: 37 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@ DEVICE_FORCEINLINE void size_8_fwd_dit(e2f *x) {
1111
// first stage
1212
#pragma unroll
1313
for (unsigned i{0}; i < 4; i++) {
14-
const e2f tmp = x[i];
15-
x[i] = e2f::add(tmp, x[i + 4]);
16-
x[i + 4] = e2f::sub(tmp, x[i + 4]);
14+
const e2f tmp = x[i];
15+
x[i] = e2f::add(tmp, x[i + 4]);
16+
x[i + 4] = e2f::sub(tmp, x[i + 4]);
1717
}
1818

1919
// second stage
2020
#pragma unroll
2121
for (unsigned i{0}; i < 2; i++) {
22-
const e2f tmp = x[i];
23-
x[i] = e2f::add(tmp, x[i + 2]);
24-
x[i + 2] = e2f::sub(tmp, x[i + 2]);
22+
const e2f tmp = x[i];
23+
x[i] = e2f::add(tmp, x[i + 2]);
24+
x[i + 2] = e2f::sub(tmp, x[i + 2]);
2525
}
2626
// x[4] = x[4] + W_1_4 * (x[6].real + i * x[6].imag)
2727
// = x[4] + (-i) * (x[6].real + i * x[6].imag)
@@ -31,19 +31,19 @@ DEVICE_FORCEINLINE void size_8_fwd_dit(e2f *x) {
3131
// = x[4] + (-x[6].imag + i * x[6].real)
3232
#pragma unroll
3333
for (unsigned i{4}; i < 6; i++) {
34-
const e2f tmp0 = x[i];
35-
x[i][0] = bf::add(x[i][0], x[i + 2][1]);
36-
x[i][1] = bf::sub(x[i][1], x[i + 2][0]);
37-
const bf tmp1 = x[i + 2][0];
38-
x[i + 2][0] = bf::sub(tmp0[0], x[i + 2][1]);
39-
x[i + 2][1] = bf::add(tmp0[1], tmp1);
34+
const e2f tmp0 = x[i];
35+
x[i][0] = bf::add(x[i][0], x[i + 2][1]);
36+
x[i][1] = bf::sub(x[i][1], x[i + 2][0]);
37+
const bf tmp1 = x[i + 2][0];
38+
x[i + 2][0] = bf::sub(tmp0[0], x[i + 2][1]);
39+
x[i + 2][1] = bf::add(tmp0[1], tmp1);
4040
}
4141

4242
// third stage
4343
{
4444
// x[3] = W_1_4 * x[3]
4545
// = -i * (x[3].real + i * x[3].imag)
46-
// = x[3].imag - i * x[3].real)
46+
// = x[3].imag - i * x[3].real
4747
const bf tmp = x[3][0];
4848
x[3][0] = x[3][1];
4949
x[3][1] = bf::neg(tmp);
@@ -52,9 +52,9 @@ DEVICE_FORCEINLINE void size_8_fwd_dit(e2f *x) {
5252
x[7] = e2f::mul(W_3_8, x[7]); // don't bother optimizing, marginal gains
5353
#pragma unroll
5454
for (unsigned i{0}; i < 8; i += 2) {
55-
const e2f tmp = x[i];
56-
x[i] = e2f::add(tmp, x[i + 1]);
57-
x[i + 1] = e2f::sub(tmp, x[i + 1]);
55+
const e2f tmp = x[i];
56+
x[i] = e2f::add(tmp, x[i + 1]);
57+
x[i + 1] = e2f::sub(tmp, x[i + 1]);
5858
}
5959

6060
// undo bitrev
@@ -74,17 +74,17 @@ DEVICE_FORCEINLINE void size_8_inv_dit(e2f *x) {
7474
// first stage
7575
#pragma unroll
7676
for (unsigned i{0}; i < 4; i++) {
77-
const e2f tmp = x[i];
78-
x[i] = e2f::add(tmp, x[i + 4]);
79-
x[i + 4] = e2f::sub(tmp, x[i + 4]);
77+
const e2f tmp = x[i];
78+
x[i] = e2f::add(tmp, x[i + 4]);
79+
x[i + 4] = e2f::sub(tmp, x[i + 4]);
8080
}
8181

8282
// second stage
8383
#pragma unroll
8484
for (unsigned i{0}; i < 2; i++) {
85-
const e2f tmp = x[i];
86-
x[i] = e2f::add(tmp, x[i + 2]);
87-
x[i + 2] = e2f::sub(tmp, x[i + 2]);
85+
const e2f tmp = x[i];
86+
x[i] = e2f::add(tmp, x[i + 2]);
87+
x[i + 2] = e2f::sub(tmp, x[i + 2]);
8888
}
8989
// x[4] = x[4] + W_1_4_INV * (x[6].real + i * x[6].imag)
9090
// = x[4] + i * (x[6].real + i * x[6].imag)
@@ -94,19 +94,19 @@ DEVICE_FORCEINLINE void size_8_inv_dit(e2f *x) {
9494
// = x[4] + (x[6].imag - i * x[6].real)
9595
#pragma unroll
9696
for (unsigned i{4}; i < 6; i++) {
97-
const e2f tmp0 = x[i];
98-
x[i][0] = bf::sub(x[i][0], x[i + 2][1]);
99-
x[i][1] = bf::add(x[i][1], x[i + 2][0]);
100-
const bf tmp1 = x[i + 2][0];
101-
x[i + 2][0] = bf::add(tmp0[0], x[i + 2][1]);
102-
x[i + 2][1] = bf::sub(tmp0[1], tmp1);
97+
const e2f tmp0 = x[i];
98+
x[i][0] = bf::sub(x[i][0], x[i + 2][1]);
99+
x[i][1] = bf::add(x[i][1], x[i + 2][0]);
100+
const bf tmp1 = x[i + 2][0];
101+
x[i + 2][0] = bf::add(tmp0[0], x[i + 2][1]);
102+
x[i + 2][1] = bf::sub(tmp0[1], tmp1);
103103
}
104104

105105
// third stage
106106
{
107107
// x[3] = W_1_4_INV * x[3]
108108
// = i * (x[3].real + i * x[3].imag)
109-
// = -x[3].imag + i * x[3].real)
109+
// = -x[3].imag + i * x[3].real
110110
const bf tmp = x[3][0];
111111
x[3][0] = bf::neg(x[3][1]);
112112
x[3][1] = tmp;
@@ -115,9 +115,9 @@ DEVICE_FORCEINLINE void size_8_inv_dit(e2f *x) {
115115
x[7] = e2f::mul(W_3_8_INV, x[7]); // don't bother optimizing, marginal gains
116116
#pragma unroll
117117
for (unsigned i{0}; i < 8; i += 2) {
118-
const e2f tmp = x[i];
119-
x[i] = e2f::add(tmp, x[i + 1]);
120-
x[i + 1] = e2f::sub(tmp, x[i + 1]);
118+
const e2f tmp = x[i];
119+
x[i] = e2f::add(tmp, x[i + 1]);
120+
x[i + 1] = e2f::sub(tmp, x[i + 1]);
121121
}
122122

123123
// undo bitrev
@@ -129,8 +129,7 @@ DEVICE_FORCEINLINE void size_8_inv_dit(e2f *x) {
129129
x[6] = tmp1;
130130
}
131131

132-
template <unsigned LOG_RADIX>
133-
DEVICE_FORCEINLINE unsigned bitrev_by_radix(const unsigned idx, const unsigned bit_chunks) {
132+
template <unsigned LOG_RADIX> DEVICE_FORCEINLINE unsigned bitrev_by_radix(const unsigned idx, const unsigned bit_chunks) {
134133
constexpr unsigned RADIX_MASK = (1 << LOG_RADIX) - 1;
135134
unsigned out{0}, tmp_idx{idx};
136135
for (unsigned i{0}; i < bit_chunks; i++) {
@@ -152,7 +151,7 @@ DEVICE_FORCEINLINE void apply_twiddles_same_region(e2f *vals0, e2f *vals1, const
152151
const auto twiddle = get_twiddle_with_direct_index<true>(v * i * twiddle_stride);
153152
vals0[i] = e2f::mul(vals0[i], twiddle);
154153
vals1[i] = e2f::mul(vals1[i], twiddle);
155-
}
154+
}
156155
}
157156
}
158157

@@ -166,15 +165,15 @@ DEVICE_FORCEINLINE void apply_twiddles_distinct_regions(e2f *vals0, e2f *vals1,
166165
for (unsigned i{1}; i < RADIX; i++) {
167166
const auto twiddle = get_twiddle_with_direct_index<true>(v * i * twiddle_stride);
168167
vals0[i] = e2f::mul(vals0[i], twiddle);
169-
}
168+
}
170169
}
171170
// exchg_region_1 should never be 0
172171
const unsigned v = bitrev_by_radix<LOG_RADIX>(exchg_region_1, idx_bit_chunks);
173172
#pragma unroll
174173
for (unsigned i{1}; i < RADIX; i++) {
175174
const auto twiddle = get_twiddle_with_direct_index<true>(v * i * twiddle_stride);
176175
vals1[i] = e2f::mul(vals1[i], twiddle);
177-
}
176+
}
178177
}
179178

180-
} // namespace airbender::ntt1
179+
} // namespace airbender::ntt

gpu_prover/src/ntt/mod.rs

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pub mod tests;
88
use era_cudart::cuda_kernel;
99
use era_cudart::error::get_last_error;
1010
use era_cudart::event::{CudaEvent, CudaEventCreateFlags};
11-
use era_cudart::execution::{CudaLaunchConfig, KernelFunction};
11+
use era_cudart::execution::{CudaLaunchConfig, Dim3, KernelFunction};
1212
use era_cudart::result::{CudaResult, CudaResultWrap};
1313
use era_cudart::slice::DeviceSlice;
1414
use era_cudart::stream::{CudaStream, CudaStreamWaitEventFlags};
@@ -115,6 +115,21 @@ n2b_multi_stage_kernel!(ab_compressed_coset_evals_to_Z_final_7_stages_warp);
115115
n2b_multi_stage_kernel!(ab_compressed_coset_evals_to_Z_final_8_stages_warp);
116116
n2b_multi_stage_kernel!(ab_compressed_coset_evals_to_Z_final_9_to_12_stages_block);
117117

118+
cuda_kernel!(
119+
N2BRadix8Nonfinal,
120+
n2b_radix_8_nonfinal_kernel,
121+
inputs_matrix: PtrAndStride<BF>,
122+
outputs_matrix: MutPtrAndStride<BF>,
123+
start_stage: u32,
124+
idx_bit_chunks: u32,
125+
log_exchg_region_size: u32,
126+
tile_gmem_stride: u32,
127+
log_n: u32,
128+
grid_offset: u32,
129+
);
130+
131+
n2b_radix_8_nonfinal_kernel!(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp);
132+
118133
cuda_kernel!(
119134
N2BRadix8,
120135
n2b_radix_8_kernel,
@@ -126,7 +141,6 @@ cuda_kernel!(
126141
grid_offset: u32,
127142
);
128143

129-
n2b_radix_8_kernel!(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp);
130144
n2b_radix_8_kernel!(ab_radix_8_main_domain_evals_to_Z_final_12_stages_block);
131145

132146
pub fn bit_reverse_by_radix_8(
@@ -156,6 +170,24 @@ pub fn bit_reverse_by_radix_8(
156170
BitReverseByRadix8Function(ab_bit_reverse_by_radix_8).launch(&config, &args)
157171
}
158172

173+
fn get_noninitial_grid_helpers(log_n: usize, start_stage: usize) -> (usize, usize, Dim3) {
174+
const LOG_RADIX: usize = 3;
175+
const LOG_TILE_SIZE: usize = 3;
176+
const LOG_WARPS_PER_BLOCK: usize = 3;
177+
let log_exchg_region_size = log_n - start_stage * LOG_RADIX;
178+
let log_tile_gmem_stride = log_exchg_region_size - 2 * LOG_RADIX;
179+
let log_blocks_per_exchg_region = log_tile_gmem_stride - LOG_TILE_SIZE - LOG_WARPS_PER_BLOCK;
180+
let tile_gmem_stride = 1 << log_tile_gmem_stride;
181+
let num_exchg_regions = 1 << (log_n - log_exchg_region_size);
182+
let mut block_dims: Dim3 = (num_exchg_regions as u32).into();
183+
block_dims.y = 1 << log_blocks_per_exchg_region as u32;
184+
assert_eq!(
185+
block_dims.x * block_dims.y,
186+
(1 << log_n).get_chunks_count(4096)
187+
);
188+
(log_exchg_region_size, tile_gmem_stride, block_dims)
189+
}
190+
159191
pub fn natural_evals_to_bitrev_Z_radix_8(
160192
inputs_matrix: &(impl DeviceMatrixChunkImpl<BF> + ?Sized),
161193
outputs_matrix: &mut (impl DeviceMatrixChunkMutImpl<BF> + ?Sized),
@@ -174,31 +206,39 @@ pub fn natural_evals_to_bitrev_Z_radix_8(
174206
let outputs_matrix_const = outputs_matrix.as_ptr_and_stride();
175207
let outputs_matrix_mut = outputs_matrix.as_mut_ptr_and_stride();
176208
let threads = 256;
177-
let blocks = n.get_chunks_count(4096);
178-
let config = CudaLaunchConfig::basic(blocks as u32, threads as u32, stream);
179-
let args = N2BRadix8Arguments::new(
209+
let (log_exchg_region_size, tile_gmem_stride, block_dims) =
210+
get_noninitial_grid_helpers(log_n, start_stage);
211+
let config = CudaLaunchConfig::basic(block_dims, threads as u32, stream);
212+
let args = N2BRadix8NonfinalArguments::new(
180213
inputs_matrix,
181214
outputs_matrix_mut,
182215
start_stage as u32,
183216
exchg_region_bit_chunks as u32,
217+
log_exchg_region_size as u32,
218+
tile_gmem_stride as u32,
184219
log_n as u32,
185220
0,
186221
);
187-
N2BRadix8Function(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp).launch(&config, &args)?;
222+
N2BRadix8NonfinalFunction(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp)
223+
.launch(&config, &args)?;
188224
start_stage += 2;
189225
exchg_region_bit_chunks += 2;
190226
let threads = 256;
191-
let blocks = n.get_chunks_count(4096);
192-
let config = CudaLaunchConfig::basic(blocks as u32, threads as u32, stream);
193-
let args = N2BRadix8Arguments::new(
227+
let (log_exchg_region_size, tile_gmem_stride, block_dims) =
228+
get_noninitial_grid_helpers(log_n, start_stage);
229+
let config = CudaLaunchConfig::basic(block_dims, threads as u32, stream);
230+
let args = N2BRadix8NonfinalArguments::new(
194231
outputs_matrix_const,
195232
outputs_matrix_mut,
196233
start_stage as u32,
197234
exchg_region_bit_chunks as u32,
235+
log_exchg_region_size as u32,
236+
tile_gmem_stride as u32,
198237
log_n as u32,
199238
0,
200239
);
201-
N2BRadix8Function(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp).launch(&config, &args)?;
240+
N2BRadix8NonfinalFunction(ab_radix_8_main_domain_evals_to_Z_nonfinal_6_stages_warp)
241+
.launch(&config, &args)?;
202242
start_stage += 2;
203243
exchg_region_bit_chunks += 2;
204244
let threads = 256;
@@ -212,7 +252,8 @@ pub fn natural_evals_to_bitrev_Z_radix_8(
212252
log_n as u32,
213253
0,
214254
);
215-
N2BRadix8Function(ab_radix_8_main_domain_evals_to_Z_final_12_stages_block).launch(&config, &args)
255+
N2BRadix8Function(ab_radix_8_main_domain_evals_to_Z_final_12_stages_block)
256+
.launch(&config, &args)
216257
}
217258

218259
#[allow(clippy::too_many_arguments)]

0 commit comments

Comments
 (0)